1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
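/* Reading aid (informal sketch): MODE_INDEX picks the per-mode entry out of
   the 5-element cost arrays in the tables below, roughly as in

     cost->mult_init[MODE_INDEX (mode)]   -- multiply startup cost
     cost->divide[MODE_INDEX (mode)]      -- divide/mod cost

   with index 4 ("other") covering any remaining mode.  The field names here
   are inferred from the initializer comments below and are meant as an
   illustration, not a definitive reference.  */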
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
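/* Worked example of the scale assumed above: with COSTS_N_INSNS (N) being N*4
   and an add being 2 bytes, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so a
   2-byte instruction costs exactly one "add" when tuning for size.  A 5-byte
   instruction (e.g. a mov of a 32-bit immediate) is COSTS_N_BYTES (5) == 10,
   between two and three adds.  Only the ratios matter when optimizing for
   size.  */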
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
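/* Reading aid (informal, assuming the stringop_algs layout used by these
   tables): each memcpy/memset descriptor below has the shape

     {alg_for_unknown_size, {{max_size, alg, noalign}, ...}}

   where the {max_size, alg, noalign} triples are tried in order for known
   block sizes and a max_size of -1 terminates the list.  DUMMY_STRINGOP_ALGS
   is a placeholder for the 64-bit slot of 32-bit-only processors, falling
   back to a library call.  The authoritative definition is the stringop_algs
   type itself.  */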
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
153 1, /* cond_not_taken_branch_cost. */
154 };
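/* Note on the scale used above and in the remaining tables: the integer, fp,
   MMX and SSE load/store entries are expressed relative to a register-to-
   register move, which counts as 2 (see the inline comments); an entry of 4
   therefore means roughly twice a reg-reg move, 8 roughly four times.  */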
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
224 1, /* cond_not_taken_branch_cost. */
225 };
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. 486 has 8kB cache
270 shared for code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
296 1, /* cond_not_taken_branch_cost. */
297 };
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
366 1, /* cond_not_taken_branch_cost. */
367 };
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
422 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
423 (we ensure the alignment). For small blocks an inline loop is still a
424 noticeable win; for bigger blocks either rep movsl or rep movsb is the
425 way to go. Rep movsb apparently has more expensive startup time in the CPU,
426 but after 4K the difference is down in the noise. */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
646 than K8 does. Alignment becomes important after 8 bytes for memcpy and
647 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set the number of simultaneous prefetches
712 to a large constant to reflect this (it is probably not a good idea to leave
713 the number of prefetches completely unlimited, as their execution also
714 takes some time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
724 small blocks it is better to use a loop. For large blocks, a libcall can
725 do nontemporal accesses and beat inline code considerably. */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
734 4, /* scalar_stmt_cost. */
735 2, /* scalar load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
744 2, /* cond_not_taken_branch_cost. */
745 };
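/* Illustration, applying the stringop_algs reading aid near the top of this
   file to the K8 memcpy descriptor above: in 32-bit mode, known copies of up
   to 6 bytes use an inline loop, up to 14 bytes an unrolled loop, and
   anything larger rep movsl (rep_prefix_4_byte); in 64-bit mode, copies of up
   to 16 bytes use a loop, up to 8192 bytes rep movsq (rep_prefix_8_byte), and
   larger blocks call the library memcpy.  This merely paraphrases the
   initializer above.  */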
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
799 /* New AMD processors never drop prefetches; if they cannot be performed
800 immediately, they are queued. We set the number of simultaneous prefetches
801 to a large constant to reflect this (it is probably not a good idea to leave
802 the number of prefetches completely unlimited, as their execution also
803 takes some time). */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
813 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
814 very small blocks it is better to use a loop. For large blocks, a libcall can
815 do nontemporal accesses and beat inline code considerably. */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
834 1, /* cond_not_taken_branch_cost. */
835 };
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
889 /* New AMD processors never drop prefetches; if they cannot be performed
890 immediately, they are queued. We set the number of simultaneous prefetches
891 to a large constant to reflect this (it is probably not a good idea to leave
892 the number of prefetches completely unlimited, as their execution also
893 takes some time). */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
903 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
904 very small blocks it is better to use a loop. For large blocks, a libcall
905 can do nontemporal accesses and beat inline code considerably. */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
924 1, /* cond_not_taken_branch_cost. */
925 };
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
979 /* New AMD processors never drop prefetches; if they cannot be performed
980 immediately, they are queued. We set the number of simultaneous prefetches
981 to a large constant to reflect this (it is probably not a good idea to leave
982 the number of prefetches completely unlimited, as their execution also
983 takes some time). */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
993 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
994 very small blocks it is better to use a loop. For large blocks, a libcall
995 can do nontemporal accesses and beat inline code considerably. */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 /* New AMD processors never drop prefetches; if they cannot be performed
1062 immediately, they are queued. We set the number of simultaneous prefetches
1063 to a large constant to reflect this (it is probably not a good idea to leave
1064 the number of prefetches completely unlimited, as their execution also
1065 takes some time). */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1075 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1076 very small blocks it is better to use a loop. For large blocks, a libcall
1077 can do nontemporal accesses and beat inline code considerably. */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
1096 1, /* cond_not_taken_branch_cost. */
1097 };
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1160 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1161 very small blocks it is better to use a loop. For large blocks, a libcall can
1162 do nontemporal accesses and beat inline code considerably. */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
1181 1, /* cond_not_taken_branch_cost. */
1182 };
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
1263 1, /* cond_not_taken_branch_cost. */
1264 };
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 128, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471   1, /* scalar_load_cost. */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487   /* On all chips taken into consideration, lea is 2 cycles or more.  With
1488      this cost, however, our current implementation of synth_mult results in
1489      use of unnecessary temporary registers, causing regressions on several
1490      SPECfp benchmarks.  */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535      value is increased to a perhaps more appropriate value of 5.  */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550   1, /* scalar_load_cost. */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1562 /* core_cost should produce code tuned for the Core family of CPUs.  */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566   /* On all chips taken into consideration, lea is 2 cycles or more.  With
1567      this cost, however, our current implementation of synth_mult results in
1568      use of unnecessary temporary registers, causing regressions on several
1569      SPECfp benchmarks.  */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613   /* FIXME: perhaps a more appropriate value is 5.  */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631   1, /* scalar_load_cost. */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705   1, /* scalar_load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1734 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1735 #define m_ATOM (1<<PROCESSOR_ATOM)
1737 #define m_GEODE (1<<PROCESSOR_GEODE)
1738 #define m_K6 (1<<PROCESSOR_K6)
1739 #define m_K6_GEODE (m_K6 | m_GEODE)
1740 #define m_K8 (1<<PROCESSOR_K8)
1741 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1742 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1743 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1744 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1745 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1746 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1747 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1748 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1749 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1750 #define m_BTVER (m_BTVER1 | m_BTVER2)
1751 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1753 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1754 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1756 /* Generic instruction choice should be a common subset of the supported CPUs
1757    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
1758 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1760 /* Feature tests against the various tunings. */
1761 unsigned char ix86_tune_features[X86_TUNE_LAST];
1763 /* Feature tests against the various tunings used to create ix86_tune_features
1764 based on the processor mask. */
1765 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1766   /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1767      negatively, so enabling it for Generic64 seems like a good code-size
1768      tradeoff.  We can't enable it for 32bit generic because it does not
1769      work well with PPro-based chips.  */
1770 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1772 /* X86_TUNE_PUSH_MEMORY */
1773 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1775 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1776 m_486 | m_PENT,
1778 /* X86_TUNE_UNROLL_STRLEN */
1779 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1781   /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1782      on simulation results.  But after the P4 was made, no performance benefit
1783      was observed from branch hints.  They also increase the code size.
1784      As a result, icc never generates branch hints.  */
1787 /* X86_TUNE_DOUBLE_WITH_ADD */
1788 ~m_386,
1790 /* X86_TUNE_USE_SAHF */
1791 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1793 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1794 partial dependencies. */
1795 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1797   /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1798      register stalls on the Generic32 compilation setting as well.  However,
1799      in the current implementation the partial register stalls are not
1800      eliminated very well - they can be introduced via subregs synthesized
1801      by combine and can happen in caller/callee saving sequences.  Because
1802      this option pays back little on PPro-based chips and conflicts with the
1803      partial reg dependencies used by Athlon/P4-based chips, it is better to
1804      leave it off for generic32 for now.  */
1805 m_PPRO,
1807 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1808 m_CORE_ALL | m_GENERIC,
1810   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1811      on 16-bit immediate moves into memory on Core2 and Corei7.  */
1812 m_CORE_ALL | m_GENERIC,
1814 /* X86_TUNE_USE_HIMODE_FIOP */
1815 m_386 | m_486 | m_K6_GEODE,
1817 /* X86_TUNE_USE_SIMODE_FIOP */
1818 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1820 /* X86_TUNE_USE_MOV0 */
1821 m_K6,
1823 /* X86_TUNE_USE_CLTD */
1824 ~(m_PENT | m_ATOM | m_K6),
1826 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1827 m_PENT4,
1829 /* X86_TUNE_SPLIT_LONG_MOVES */
1830 m_PPRO,
1832 /* X86_TUNE_READ_MODIFY_WRITE */
1833 ~m_PENT,
1835 /* X86_TUNE_READ_MODIFY */
1836 ~(m_PENT | m_PPRO),
1838 /* X86_TUNE_PROMOTE_QIMODE */
1839 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1841 /* X86_TUNE_FAST_PREFIX */
1842 ~(m_386 | m_486 | m_PENT),
1844 /* X86_TUNE_SINGLE_STRINGOP */
1845 m_386 | m_P4_NOCONA,
1847 /* X86_TUNE_QIMODE_MATH */
1850 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1851 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1852 might be considered for Generic32 if our scheme for avoiding partial
1853 stalls was more effective. */
1854 ~m_PPRO,
1856 /* X86_TUNE_PROMOTE_QI_REGS */
1859 /* X86_TUNE_PROMOTE_HI_REGS */
1860 m_PPRO,
1862 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1863 over esp addition. */
1864 m_386 | m_486 | m_PENT | m_PPRO,
1866 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1867 over esp addition. */
1868 m_PENT,
1870 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1871 over esp subtraction. */
1872 m_386 | m_486 | m_PENT | m_K6_GEODE,
1874   /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
1875 over esp subtraction. */
1876 m_PENT | m_K6_GEODE,
1878 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1879 for DFmode copies */
1880   ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1882 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1883 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1885   /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1886      conflict here between PPro/Pentium4-based chips that treat 128bit
1887      SSE registers as single units and K8-based chips that divide SSE
1888      registers into two 64bit halves.  This knob promotes all store destinations
1889      to be 128bit to allow register renaming on 128bit SSE units, but usually
1890      results in one extra microop on 64bit SSE units.  Experimental results
1891      show that disabling this option on P4 brings over 20% SPECfp regression,
1892      while enabling it on K8 brings roughly 2.4% regression that can be partly
1893      masked by careful scheduling of moves.  */
1894 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1896 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1897 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1899 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1900 m_COREI7 | m_BDVER,
1902 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1903   m_BDVER,
1905   /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1906      are resolved on SSE register parts instead of whole registers, so we may
1907      maintain just the lower part of scalar values in the proper format, leaving
1908      the upper part undefined.  */
1909 m_ATHLON_K8,
1911 /* X86_TUNE_SSE_TYPELESS_STORES */
1912 m_AMD_MULTIPLE,
1914 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1915 m_PPRO | m_P4_NOCONA,
1917 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1918 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1920 /* X86_TUNE_PROLOGUE_USING_MOVE */
1921 m_PPRO | m_ATHLON_K8,
1923 /* X86_TUNE_EPILOGUE_USING_MOVE */
1924 m_PPRO | m_ATHLON_K8,
1926 /* X86_TUNE_SHIFT1 */
1927 ~m_486,
1929 /* X86_TUNE_USE_FFREEP */
1930 m_AMD_MULTIPLE,
1932 /* X86_TUNE_INTER_UNIT_MOVES */
1933 ~(m_AMD_MULTIPLE | m_GENERIC),
1935 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1936   ~(m_AMDFAM10 | m_BDVER),
1938 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1939 than 4 branch instructions in the 16 byte window. */
1940 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1942 /* X86_TUNE_SCHEDULE */
1943 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1945 /* X86_TUNE_USE_BT */
1946 m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1948 /* X86_TUNE_USE_INCDEC */
1949 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
1951 /* X86_TUNE_PAD_RETURNS */
1952 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
1954   /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions.  */
1955 m_ATOM,
1957 /* X86_TUNE_EXT_80387_CONSTANTS */
1958 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1960 /* X86_TUNE_AVOID_VECTOR_DECODE */
1961 m_CORE_ALL | m_K8 | m_GENERIC64,
1963   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1964      and SImode multiply, but the 386 and 486 do HImode multiply faster.  */
1965 ~(m_386 | m_486),
1967 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1968 vector path on AMD machines. */
1969 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1971 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1972 machines. */
1973 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1975 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1976 than a MOV. */
1977 m_PENT,
1979 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1980 but one byte longer. */
1981 m_PENT,
1983   /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
1984      operand that cannot be represented using a modRM byte.  The XOR
1985      replacement is long decoded, so this split helps here as well.  */
1986 m_K6,
1988 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1989 from FP to FP. */
1990 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
1992 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1993 from integer to FP. */
1994 m_AMDFAM10,
1996 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1997 with a subsequent conditional jump instruction into a single
1998 compare-and-branch uop. */
1999 m_BDVER,
2001 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2002 will impact LEA instruction selection. */
2003 m_ATOM,
2005 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2006 instructions. */
2007 ~m_ATOM,
2009   /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2010      at -O3.  For the moment, the prefetching seems badly tuned for Intel
2011      chips.  */
2012 m_K6_GEODE | m_AMD_MULTIPLE,
2014 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2015 the auto-vectorizer. */
2016 m_BDVER | m_BTVER2,
2018 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2019 during reassociation of integer computation. */
2020 m_ATOM,
2022 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2023 during reassociation of fp computation. */
2024 m_ATOM,
2026 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2027 regs instead of memory. */
2028 m_CORE_ALL,
2030 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2031 a conditional move. */
2032 m_ATOM
2035 /* Feature tests against the various architecture variations. */
2036 unsigned char ix86_arch_features[X86_ARCH_LAST];
2038 /* Feature tests against the various architecture variations, used to create
2039 ix86_arch_features based on the processor mask. */
2040 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2041 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2042 ~(m_386 | m_486 | m_PENT | m_K6),
2044 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2045 ~m_386,
2047 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2048 ~(m_386 | m_486),
2050 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2051 ~m_386,
2053 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2054 ~m_386,
2057 static const unsigned int x86_accumulate_outgoing_args
2058 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2060 static const unsigned int x86_arch_always_fancy_math_387
2061 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2063 static const unsigned int x86_avx256_split_unaligned_load
2064 = m_COREI7 | m_GENERIC;
2066 static const unsigned int x86_avx256_split_unaligned_store
2067 = m_COREI7 | m_BDVER | m_GENERIC;
2069 /* In case the average insn count for a single function invocation is
2070    lower than this constant, emit fast (but longer) prologue and
2071    epilogue code.  */
2072 #define FAST_PROLOGUE_INSN_COUNT 20
2074 /* Names for 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
2075 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2076 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2077 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2079 /* Array of the smallest class containing reg number REGNO, indexed by
2080 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2082 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2084 /* ax, dx, cx, bx */
2085 AREG, DREG, CREG, BREG,
2086 /* si, di, bp, sp */
2087 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2088 /* FP registers */
2089 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2090 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2091 /* arg pointer */
2092 NON_Q_REGS,
2093 /* flags, fpsr, fpcr, frame */
2094 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2095 /* SSE registers */
2096 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2097 SSE_REGS, SSE_REGS,
2098 /* MMX registers */
2099 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2100 MMX_REGS, MMX_REGS,
2101 /* REX registers */
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2104 /* SSE REX registers */
2105 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2106 SSE_REGS, SSE_REGS,
2109 /* The "default" register map used in 32bit mode. */
2111 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2113 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2114 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2115 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2116 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2117 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2119 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2122 /* The "default" register map used in 64bit mode. */
2124 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2126 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2127 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2128 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2129 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2130 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2131 8,9,10,11,12,13,14,15, /* extended integer registers */
2132 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2135 /* Define the register numbers to be used in Dwarf debugging information.
2136 The SVR4 reference port C compiler uses the following register numbers
2137 in its Dwarf output code:
2138 0 for %eax (gcc regno = 0)
2139 1 for %ecx (gcc regno = 2)
2140 2 for %edx (gcc regno = 1)
2141 3 for %ebx (gcc regno = 3)
2142 4 for %esp (gcc regno = 7)
2143 5 for %ebp (gcc regno = 6)
2144 6 for %esi (gcc regno = 4)
2145 7 for %edi (gcc regno = 5)
2146 The following three DWARF register numbers are never generated by
2147 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2148 believes these numbers have these meanings.
2149 8 for %eip (no gcc equivalent)
2150 9 for %eflags (gcc regno = 17)
2151 10 for %trapno (no gcc equivalent)
2152 It is not at all clear how we should number the FP stack registers
2153 for the x86 architecture. If the version of SDB on x86/svr4 were
2154 a bit less brain dead with respect to floating-point then we would
2155 have a precedent to follow with respect to DWARF register numbers
2156 for x86 FP registers, but the SDB on x86/svr4 is so completely
2157 broken with respect to FP registers that it is hardly worth thinking
2158 of it as something to strive for compatibility with.
2159 The version of x86/svr4 SDB I have at the moment does (partially)
2160 seem to believe that DWARF register number 11 is associated with
2161 the x86 register %st(0), but that's about all. Higher DWARF
2162 register numbers don't seem to be associated with anything in
2163 particular, and even for DWARF regno 11, SDB only seems to under-
2164 stand that it should say that a variable lives in %st(0) (when
2165 asked via an `=' command) if we said it was in DWARF regno 11,
2166 but SDB still prints garbage when asked for the value of the
2167 variable in question (via a `/' command).
2168 (Also note that the labels SDB prints for various FP stack regs
2169 when doing an `x' command are all wrong.)
2170 Note that these problems generally don't affect the native SVR4
2171 C compiler because it doesn't allow the use of -O with -g and
2172 because when it is *not* optimizing, it allocates a memory
2173 location for each floating-point variable, and the memory
2174 location is what gets described in the DWARF AT_location
2175 attribute for the variable in question.
2176 Regardless of the severe mental illness of the x86/svr4 SDB, we
2177 do something sensible here and we use the following DWARF
2178 register numbers. Note that these are all stack-top-relative
2179 numbers.
2180 11 for %st(0) (gcc regno = 8)
2181 12 for %st(1) (gcc regno = 9)
2182 13 for %st(2) (gcc regno = 10)
2183 14 for %st(3) (gcc regno = 11)
2184 15 for %st(4) (gcc regno = 12)
2185 16 for %st(5) (gcc regno = 13)
2186 17 for %st(6) (gcc regno = 14)
2187 18 for %st(7) (gcc regno = 15)
2189 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2191 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2192 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2193 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2194 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2195 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2197 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
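  /* Worked example of the mapping documented above: DWARF regno 5 is %ebp
     (gcc regno 6) and DWARF regno 6 is %esi (gcc regno 4), which is why the
     general-regs row of this table ends ..., 6, 7, 5, 4 for the si, di, bp
     and sp slots.  */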
2200 /* Define parameter passing and return registers. */
2202 static int const x86_64_int_parameter_registers[6] =
2204 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2207 static int const x86_64_ms_abi_int_parameter_registers[4] =
2209 CX_REG, DX_REG, R8_REG, R9_REG
2212 static int const x86_64_int_return_registers[4] =
2214 AX_REG, DX_REG, DI_REG, SI_REG
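/* For example, under the SysV x86-64 ABI a call f (a, b, c) passes its first
   three integer arguments in %rdi, %rsi and %rdx (the first three entries of
   x86_64_int_parameter_registers above), whereas the MS ABI variant uses
   %rcx, %rdx and %r8 for the same arguments.  */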
2217 /* Define the structure for the machine field in struct function. */
2219 struct GTY(()) stack_local_entry {
2220 unsigned short mode;
2221 unsigned short n;
2222 rtx rtl;
2223 struct stack_local_entry *next;
2226 /* Structure describing stack frame layout.
2227 Stack grows downward:
2229 [arguments]
2230 <- ARG_POINTER
2231 saved pc
2233 saved static chain if ix86_static_chain_on_stack
2235 saved frame pointer if frame_pointer_needed
2236 <- HARD_FRAME_POINTER
2237 [saved regs]
2238 <- regs_save_offset
2239 [padding0]
2241 [saved SSE regs]
2242 <- sse_regs_save_offset
2243 [padding1] |
2244 | <- FRAME_POINTER
2245 [va_arg registers] |
2247 [frame] |
2249 [padding2] | = to_allocate
2250 <- STACK_POINTER
2252 struct ix86_frame
2254 int nsseregs;
2255 int nregs;
2256 int va_arg_size;
2257 int red_zone_size;
2258 int outgoing_arguments_size;
2260 /* The offsets relative to ARG_POINTER. */
2261 HOST_WIDE_INT frame_pointer_offset;
2262 HOST_WIDE_INT hard_frame_pointer_offset;
2263 HOST_WIDE_INT stack_pointer_offset;
2264 HOST_WIDE_INT hfp_save_offset;
2265 HOST_WIDE_INT reg_save_offset;
2266 HOST_WIDE_INT sse_reg_save_offset;
2268 /* When save_regs_using_mov is set, emit prologue using
2269 move instead of push instructions. */
2270 bool save_regs_using_mov;
2273 /* Which cpu are we scheduling for. */
2274 enum attr_cpu ix86_schedule;
2276 /* Which cpu are we optimizing for. */
2277 enum processor_type ix86_tune;
2279 /* Which instruction set architecture to use. */
2280 enum processor_type ix86_arch;
2282 /* True if processor has SSE prefetch instruction. */
2283 unsigned char x86_prefetch_sse;
2285 /* -mstackrealign option */
2286 static const char ix86_force_align_arg_pointer_string[]
2287 = "force_align_arg_pointer";
2289 static rtx (*ix86_gen_leave) (void);
2290 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2292 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2293 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2294 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2296 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2297 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2300 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
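/* These generator hooks are presumably pointed at the SImode or DImode
   variants of the corresponding patterns once the target word/pointer size
   is known, roughly along the lines of (illustrative assumption only)

     ix86_gen_add3 = TARGET_64BIT ? gen_adddi3 : gen_addsi3;

   so the rest of the backend can emit the right-sized insn without checking
   the mode at every call site.  */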
2302 /* Preferred alignment for stack boundary in bits. */
2303 unsigned int ix86_preferred_stack_boundary;
2305 /* Alignment for incoming stack boundary in bits specified at
2306 command line. */
2307 static unsigned int ix86_user_incoming_stack_boundary;
2309 /* Default alignment for incoming stack boundary in bits. */
2310 static unsigned int ix86_default_incoming_stack_boundary;
2312 /* Alignment for incoming stack boundary in bits. */
2313 unsigned int ix86_incoming_stack_boundary;
2315 /* Calling abi specific va_list type nodes. */
2316 static GTY(()) tree sysv_va_list_type_node;
2317 static GTY(()) tree ms_va_list_type_node;
2319 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2320 char internal_label_prefix[16];
2321 int internal_label_prefix_len;
2323 /* Fence to use after loop using movnt. */
2324 tree x86_mfence;
2326 /* Register class used for passing a given 64bit part of the argument.
2327    These represent classes as documented by the psABI, with the exception
2328    of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2329    uses an SFmode or DFmode move instead of a DImode one to avoid reformatting penalties.
2331    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2332    whenever possible (the upper half does contain padding).  */
2333 enum x86_64_reg_class
2335 X86_64_NO_CLASS,
2336 X86_64_INTEGER_CLASS,
2337 X86_64_INTEGERSI_CLASS,
2338 X86_64_SSE_CLASS,
2339 X86_64_SSESF_CLASS,
2340 X86_64_SSEDF_CLASS,
2341 X86_64_SSEUP_CLASS,
2342 X86_64_X87_CLASS,
2343 X86_64_X87UP_CLASS,
2344 X86_64_COMPLEX_X87_CLASS,
2345 X86_64_MEMORY_CLASS
2348 #define MAX_CLASSES 4
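/* A hedged illustration of how these classes are used (the precise rules
   live in classify_argument further below): each eightbyte of an argument
   receives one class, so a small aggregate such as

     struct s { double d; long l; };

   would typically classify as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }
   and be passed in one SSE register and one integer register.  */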
2350 /* Table of constants used by fldpi, fldln2, etc.... */
2351 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2352 static bool ext_80387_constants_init = 0;
2355 static struct machine_function * ix86_init_machine_status (void);
2356 static rtx ix86_function_value (const_tree, const_tree, bool);
2357 static bool ix86_function_value_regno_p (const unsigned int);
2358 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2359 const_tree);
2360 static rtx ix86_static_chain (const_tree, bool);
2361 static int ix86_function_regparm (const_tree, const_tree);
2362 static void ix86_compute_frame_layout (struct ix86_frame *);
2363 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2364 rtx, rtx, int);
2365 static void ix86_add_new_builtins (HOST_WIDE_INT);
2366 static tree ix86_canonical_va_list_type (tree);
2367 static void predict_jump (int);
2368 static unsigned int split_stack_prologue_scratch_regno (void);
2369 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2371 enum ix86_function_specific_strings
2373 IX86_FUNCTION_SPECIFIC_ARCH,
2374 IX86_FUNCTION_SPECIFIC_TUNE,
2375 IX86_FUNCTION_SPECIFIC_MAX
2378 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2379 const char *, enum fpmath_unit, bool);
2380 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2381 static void ix86_function_specific_save (struct cl_target_option *);
2382 static void ix86_function_specific_restore (struct cl_target_option *);
2383 static void ix86_function_specific_print (FILE *, int,
2384 struct cl_target_option *);
2385 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2386 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2387 struct gcc_options *);
2388 static bool ix86_can_inline_p (tree, tree);
2389 static void ix86_set_current_function (tree);
2390 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2392 static enum calling_abi ix86_function_abi (const_tree);
2395 #ifndef SUBTARGET32_DEFAULT_CPU
2396 #define SUBTARGET32_DEFAULT_CPU "i386"
2397 #endif
2399 /* Whether -mtune= or -march= were specified */
2400 static int ix86_tune_defaulted;
2401 static int ix86_arch_specified;
2403 /* Vectorization library interface and handlers. */
2404 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2407 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2409 /* Processor target table, indexed by processor number */
2410 struct ptt
2412 const struct processor_costs *cost; /* Processor costs */
2413 const int align_loop; /* Default alignments. */
2414 const int align_loop_max_skip;
2415 const int align_jump;
2416 const int align_jump_max_skip;
2417 const int align_func;
2420 static const struct ptt processor_target_table[PROCESSOR_max] =
2422 {&i386_cost, 4, 3, 4, 3, 4},
2423 {&i486_cost, 16, 15, 16, 15, 16},
2424 {&pentium_cost, 16, 7, 16, 7, 16},
2425 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2426 {&geode_cost, 0, 0, 0, 0, 0},
2427 {&k6_cost, 32, 7, 32, 7, 32},
2428 {&athlon_cost, 16, 7, 16, 7, 16},
2429 {&pentium4_cost, 0, 0, 0, 0, 0},
2430 {&k8_cost, 16, 7, 16, 7, 16},
2431 {&nocona_cost, 0, 0, 0, 0, 0},
2432 /* Core 2 */
2433 {&core_cost, 16, 10, 16, 10, 16},
2434 /* Core i7 */
2435 {&core_cost, 16, 10, 16, 10, 16},
2436 /* Core avx2 */
2437 {&core_cost, 16, 10, 16, 10, 16},
2438 {&generic32_cost, 16, 7, 16, 7, 16},
2439 {&generic64_cost, 16, 10, 16, 10, 16},
2440 {&amdfam10_cost, 32, 24, 32, 7, 32},
2441 {&bdver1_cost, 32, 24, 32, 7, 32},
2442 {&bdver2_cost, 32, 24, 32, 7, 32},
2443 {&bdver3_cost, 32, 24, 32, 7, 32},
2444 {&btver1_cost, 32, 24, 32, 7, 32},
2445 {&btver2_cost, 32, 24, 32, 7, 32},
2446 {&atom_cost, 16, 15, 16, 7, 16}
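/* This table is indexed by the PROCESSOR_* value chosen from -march/-mtune;
   option handling then does, in rough outline (a sketch, not the literal
   code), something like

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     align_functions = processor_target_table[ix86_tune].align_func;

   to pick up the per-processor cost table and alignment defaults.  */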
2449 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2451 "generic",
2452 "i386",
2453 "i486",
2454 "pentium",
2455 "pentium-mmx",
2456 "pentiumpro",
2457 "pentium2",
2458 "pentium3",
2459 "pentium4",
2460 "pentium-m",
2461 "prescott",
2462 "nocona",
2463 "core2",
2464 "corei7",
2465 "core-avx2",
2466 "atom",
2467 "geode",
2468 "k6",
2469 "k6-2",
2470 "k6-3",
2471 "athlon",
2472 "athlon-4",
2473 "k8",
2474 "amdfam10",
2475 "bdver1",
2476 "bdver2",
2477 "bdver3",
2478 "btver1",
2479 "btver2"
2482 static bool
2483 gate_insert_vzeroupper (void)
2485 return TARGET_VZEROUPPER;
2488 static unsigned int
2489 rest_of_handle_insert_vzeroupper (void)
2491 int i;
2493 /* vzeroupper instructions are inserted immediately after reload to
2494 account for possible spills from 256bit registers. The pass
2495 reuses mode switching infrastructure by re-running mode insertion
2496 pass, so disable entities that have already been processed. */
2497 for (i = 0; i < MAX_386_ENTITIES; i++)
2498 ix86_optimize_mode_switching[i] = 0;
2500 ix86_optimize_mode_switching[AVX_U128] = 1;
2502 /* Call optimize_mode_switching. */
2503 pass_mode_switching.pass.execute ();
2504 return 0;
2507 struct rtl_opt_pass pass_insert_vzeroupper =
2510 RTL_PASS,
2511 "vzeroupper", /* name */
2512 OPTGROUP_NONE, /* optinfo_flags */
2513 gate_insert_vzeroupper, /* gate */
2514 rest_of_handle_insert_vzeroupper, /* execute */
2515 NULL, /* sub */
2516 NULL, /* next */
2517 0, /* static_pass_number */
2518 TV_NONE, /* tv_id */
2519 0, /* properties_required */
2520 0, /* properties_provided */
2521 0, /* properties_destroyed */
2522 0, /* todo_flags_start */
2523 TODO_df_finish | TODO_verify_rtl_sharing |
2524 0, /* todo_flags_finish */
2528 /* Return true if a red-zone is in use. */
2530 static inline bool
2531 ix86_using_red_zone (void)
2533 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2536 /* Return a string that documents the current -m options. The caller is
2537 responsible for freeing the string. */
2539 static char *
2540 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2541 const char *tune, enum fpmath_unit fpmath,
2542 bool add_nl_p)
2544 struct ix86_target_opts
2546 const char *option; /* option string */
2547 HOST_WIDE_INT mask; /* isa mask options */
2550   /* This table is ordered so that options like -msse4.2, which imply the
2551      preceding options, are matched first.  */
2552 static struct ix86_target_opts isa_opts[] =
2554 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2555 { "-mfma", OPTION_MASK_ISA_FMA },
2556 { "-mxop", OPTION_MASK_ISA_XOP },
2557 { "-mlwp", OPTION_MASK_ISA_LWP },
2558 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2559 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2560 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2561 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2562 { "-msse3", OPTION_MASK_ISA_SSE3 },
2563 { "-msse2", OPTION_MASK_ISA_SSE2 },
2564 { "-msse", OPTION_MASK_ISA_SSE },
2565 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2566 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2567 { "-mmmx", OPTION_MASK_ISA_MMX },
2568 { "-mabm", OPTION_MASK_ISA_ABM },
2569 { "-mbmi", OPTION_MASK_ISA_BMI },
2570 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2571 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2572 { "-mhle", OPTION_MASK_ISA_HLE },
2573 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2574 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2575 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2576 { "-madx", OPTION_MASK_ISA_ADX },
2577 { "-mtbm", OPTION_MASK_ISA_TBM },
2578 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2579 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2580 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2581 { "-maes", OPTION_MASK_ISA_AES },
2582 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2583 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2584 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2585 { "-mf16c", OPTION_MASK_ISA_F16C },
2586 { "-mrtm", OPTION_MASK_ISA_RTM },
2587 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2588 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2591 /* Flag options. */
2592 static struct ix86_target_opts flag_opts[] =
2594 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2595 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2596 { "-m80387", MASK_80387 },
2597 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2598 { "-malign-double", MASK_ALIGN_DOUBLE },
2599 { "-mcld", MASK_CLD },
2600 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2601 { "-mieee-fp", MASK_IEEE_FP },
2602 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2603 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2604 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2605 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2606 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2607 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2608 { "-mno-red-zone", MASK_NO_RED_ZONE },
2609 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2610 { "-mrecip", MASK_RECIP },
2611 { "-mrtd", MASK_RTD },
2612 { "-msseregparm", MASK_SSEREGPARM },
2613 { "-mstack-arg-probe", MASK_STACK_PROBE },
2614 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2615 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2616 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2617 { "-mvzeroupper", MASK_VZEROUPPER },
2618 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2619 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2620 { "-mprefer-avx128", MASK_PREFER_AVX128},
2623 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2625 char isa_other[40];
2626 char target_other[40];
2627 unsigned num = 0;
2628 unsigned i, j;
2629 char *ret;
2630 char *ptr;
2631 size_t len;
2632 size_t line_len;
2633 size_t sep_len;
2634 const char *abi;
2636 memset (opts, '\0', sizeof (opts));
2638 /* Add -march= option. */
2639 if (arch)
2641 opts[num][0] = "-march=";
2642 opts[num++][1] = arch;
2645 /* Add -mtune= option. */
2646 if (tune)
2648 opts[num][0] = "-mtune=";
2649 opts[num++][1] = tune;
2652 /* Add -m32/-m64/-mx32. */
2653 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2655 if ((isa & OPTION_MASK_ABI_64) != 0)
2656 abi = "-m64";
2657 else
2658 abi = "-mx32";
2659 isa &= ~ (OPTION_MASK_ISA_64BIT
2660 | OPTION_MASK_ABI_64
2661 | OPTION_MASK_ABI_X32);
2663 else
2664 abi = "-m32";
2665 opts[num++][0] = abi;
2667   /* Pick out the options in isa_opts.  */
2668 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2670 if ((isa & isa_opts[i].mask) != 0)
2672 opts[num++][0] = isa_opts[i].option;
2673 isa &= ~ isa_opts[i].mask;
2677 if (isa && add_nl_p)
2679 opts[num++][0] = isa_other;
2680 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2681 isa);
2684 /* Add flag options. */
2685 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2687 if ((flags & flag_opts[i].mask) != 0)
2689 opts[num++][0] = flag_opts[i].option;
2690 flags &= ~ flag_opts[i].mask;
2694 if (flags && add_nl_p)
2696 opts[num++][0] = target_other;
2697 sprintf (target_other, "(other flags: %#x)", flags);
2700 /* Add -fpmath= option. */
2701 if (fpmath)
2703 opts[num][0] = "-mfpmath=";
2704 switch ((int) fpmath)
2706 case FPMATH_387:
2707 opts[num++][1] = "387";
2708 break;
2710 case FPMATH_SSE:
2711 opts[num++][1] = "sse";
2712 break;
2714 case FPMATH_387 | FPMATH_SSE:
2715 opts[num++][1] = "sse+387";
2716 break;
2718 default:
2719 gcc_unreachable ();
2723 /* Any options? */
2724 if (num == 0)
2725 return NULL;
2727 gcc_assert (num < ARRAY_SIZE (opts));
2729 /* Size the string. */
2730 len = 0;
2731 sep_len = (add_nl_p) ? 3 : 1;
2732 for (i = 0; i < num; i++)
2734 len += sep_len;
2735 for (j = 0; j < 2; j++)
2736 if (opts[i][j])
2737 len += strlen (opts[i][j]);
2740 /* Build the string. */
2741 ret = ptr = (char *) xmalloc (len);
2742 line_len = 0;
2744 for (i = 0; i < num; i++)
2746 size_t len2[2];
2748 for (j = 0; j < 2; j++)
2749 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2751 if (i != 0)
2753 *ptr++ = ' ';
2754 line_len++;
2756 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2758 *ptr++ = '\\';
2759 *ptr++ = '\n';
2760 line_len = 0;
2764 for (j = 0; j < 2; j++)
2765 if (opts[i][j])
2767 memcpy (ptr, opts[i][j], len2[j]);
2768 ptr += len2[j];
2769 line_len += len2[j];
2773 *ptr = '\0';
2774 gcc_assert (ret + len >= ptr);
2776 return ret;
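  /* For illustration, for a 64-bit compile with -march=corei7 -mtune=generic
     and SSE math, the string assembled above would look roughly like
       "-march=corei7 -mtune=generic -m64 -msse4.2 -msse4.1 ... -mfpmath=sse"
     with the pieces emitted in the order they were pushed into opts[].  */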
2779 /* Return true if profiling code should be emitted before the prologue,
2780    false otherwise.  On x86 this is the case when -mfentry is in effect,
2781    as used for "hotfix"-style patching.  */
2782 static bool
2783 ix86_profile_before_prologue (void)
2785 return flag_fentry != 0;
2788 /* Function that is callable from the debugger to print the current
2789 options. */
2790 void
2791 ix86_debug_options (void)
2793 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2794 ix86_arch_string, ix86_tune_string,
2795 ix86_fpmath, true);
2797 if (opts)
2799 fprintf (stderr, "%s\n\n", opts);
2800 free (opts);
2802 else
2803 fputs ("<no options>\n\n", stderr);
2805 return;
2808 /* Override various settings based on options. If MAIN_ARGS_P, the
2809 options are from the command line, otherwise they are from
2810 attributes. */
2812 static void
2813 ix86_option_override_internal (bool main_args_p)
2815 int i;
2816 unsigned int ix86_arch_mask, ix86_tune_mask;
2817 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2818 const char *prefix;
2819 const char *suffix;
2820 const char *sw;
2822 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2823 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2824 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2825 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2826 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2827 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2828 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2829 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2830 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2831 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2832 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2833 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2834 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2835 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2836 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2837 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2838 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2839 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2840 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2841 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2842 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2843 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2844 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2845 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2846 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2847 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2848 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2849 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2850 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2851 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2852 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2853 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2854 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2855 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2856 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2857 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2858 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2859 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2860 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2861 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2863 /* If this reaches 64, we need to widen the struct pta flags field below.  */
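/* In rough outline (an illustrative sketch, not the literal code), the PTA_*
   bits selected by the -march entry below are translated into ISA option
   masks unless the user already set them explicitly, e.g.

     if (processor_alias_table[i].flags & PTA_SSE2
         && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
       ix86_isa_flags |= OPTION_MASK_ISA_SSE2;

   repeated for each feature bit.  */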
2865 static struct pta
2867 const char *const name; /* processor name or nickname. */
2868 const enum processor_type processor;
2869 const enum attr_cpu schedule;
2870 const unsigned HOST_WIDE_INT flags;
2872 const processor_alias_table[] =
2874 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2875 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2876 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2877 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2878 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2879 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2880 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2881 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2882 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2883 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2884 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2885 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2886 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2887 PTA_MMX | PTA_SSE | PTA_FXSR},
2888 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2889 PTA_MMX | PTA_SSE | PTA_FXSR},
2890 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2891 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2892 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2893 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2894 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2895 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2896 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2897 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2898 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2899 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2900 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2901 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2902 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2903 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2904 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2905 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2906 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2907 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2908 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2909 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2910 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2911 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2912 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2913 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2914 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2915 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2916 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2917 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
2918 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2919 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2920 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2921 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2922 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2923 | PTA_XSAVEOPT},
2924 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2925 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2926 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2927 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2928 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2929 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2930 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2931 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2932 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2933 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2934 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2935 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2936 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2937 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2938 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2939 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2940 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2941 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2942 {"x86-64", PROCESSOR_K8, CPU_K8,
2943 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2944 {"k8", PROCESSOR_K8, CPU_K8,
2945 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2946 | PTA_SSE2 | PTA_NO_SAHF},
2947 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2948 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2949 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2950 {"opteron", PROCESSOR_K8, CPU_K8,
2951 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2952 | PTA_SSE2 | PTA_NO_SAHF},
2953 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2954 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2955 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2956 {"athlon64", PROCESSOR_K8, CPU_K8,
2957 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2958 | PTA_SSE2 | PTA_NO_SAHF},
2959 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2960 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2961 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2962 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2963 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2964 | PTA_SSE2 | PTA_NO_SAHF},
2965 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2966 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2967 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2968 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2969 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2970 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2971 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2974 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2975 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2976 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2977 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2978 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2979 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2980 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2981 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2982 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2983 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2984 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2985 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2986 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2987 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2988 | PTA_XSAVEOPT},
2989 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2990 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2991 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2992 | PTA_FXSR | PTA_XSAVE},
2993 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
2994 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2995 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2996 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2997 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2998 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3000 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3001 PTA_HLE /* flags are only used for -march switch. */ },
3002 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3003 PTA_64BIT
3004 | PTA_HLE /* flags are only used for -march switch. */ },
3007 /* -mrecip options. */
3008 static struct
3010 const char *string; /* option name */
3011 unsigned int mask; /* mask bits to set */
3013 const recip_options[] =
3015 { "all", RECIP_MASK_ALL },
3016 { "none", RECIP_MASK_NONE },
3017 { "div", RECIP_MASK_DIV },
3018 { "sqrt", RECIP_MASK_SQRT },
3019 { "vec-div", RECIP_MASK_VEC_DIV },
3020 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3023 int const pta_size = ARRAY_SIZE (processor_alias_table);
3025 /* Set up prefix/suffix so the error messages refer to either the command
3026 line argument, or the attribute(target). */
3027 if (main_args_p)
3029 prefix = "-m";
3030 suffix = "";
3031 sw = "switch";
3033 else
3035 prefix = "option(\"";
3036 suffix = "\")";
3037 sw = "attribute";
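/* For example, a bad tuning value given on the command line is reported
   against the "-mtune=" switch, while the same value coming from
   attribute((target(...))) is reported against the option("tune=") attribute.  */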
3040 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3041 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3042 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3043 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3044 #ifdef TARGET_BI_ARCH
3045 else
3047 #if TARGET_BI_ARCH == 1
3048 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3049 is on and OPTION_MASK_ABI_X32 is off. We turn off
3050 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3051 -mx32. */
3052 if (TARGET_X32)
3053 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3054 #else
3055 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3056 on and OPTION_MASK_ABI_64 is off. We turn off
3057 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3058 -m64. */
3059 if (TARGET_LP64)
3060 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3061 #endif
3063 #endif
3065 if (TARGET_X32)
3067 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3068 OPTION_MASK_ABI_64 for TARGET_X32. */
3069 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3070 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3072 else if (TARGET_LP64)
3074 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3075 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3076 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3077 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
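/* After this, whenever the 64-bit ISA is enabled exactly one of
   OPTION_MASK_ABI_64 (the LP64 ABI, -m64) and OPTION_MASK_ABI_X32
   (the ILP32 ABI on the 64-bit ISA, -mx32) remains set.  */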
3080 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3081 SUBTARGET_OVERRIDE_OPTIONS;
3082 #endif
3084 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3085 SUBSUBTARGET_OVERRIDE_OPTIONS;
3086 #endif
3088 /* -fPIC is the default for x86_64. */
3089 if (TARGET_MACHO && TARGET_64BIT)
3090 flag_pic = 2;
3092 /* Need to check -mtune=generic first. */
3093 if (ix86_tune_string)
3095 if (!strcmp (ix86_tune_string, "generic")
3096 || !strcmp (ix86_tune_string, "i686")
3097 /* As special support for cross compilers we read -mtune=native
3098 as -mtune=generic. With native compilers we won't see the
3099 -mtune=native, as it was changed by the driver. */
3100 || !strcmp (ix86_tune_string, "native"))
3102 if (TARGET_64BIT)
3103 ix86_tune_string = "generic64";
3104 else
3105 ix86_tune_string = "generic32";
3107 /* If this call is for setting the option attribute, allow the
3108 generic32/generic64 that was previously set. */
3109 else if (!main_args_p
3110 && (!strcmp (ix86_tune_string, "generic32")
3111 || !strcmp (ix86_tune_string, "generic64")))
3113 else if (!strncmp (ix86_tune_string, "generic", 7))
3114 error ("bad value (%s) for %stune=%s %s",
3115 ix86_tune_string, prefix, suffix, sw);
3116 else if (!strcmp (ix86_tune_string, "x86-64"))
3117 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3118 "%stune=k8%s or %stune=generic%s instead as appropriate",
3119 prefix, suffix, prefix, suffix, prefix, suffix);
3121 else
3123 if (ix86_arch_string)
3124 ix86_tune_string = ix86_arch_string;
3125 if (!ix86_tune_string)
3127 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3128 ix86_tune_defaulted = 1;
3131 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3132 need to use a sensible tune option. */
3133 if (!strcmp (ix86_tune_string, "generic")
3134 || !strcmp (ix86_tune_string, "x86-64")
3135 || !strcmp (ix86_tune_string, "i686"))
3137 if (TARGET_64BIT)
3138 ix86_tune_string = "generic64";
3139 else
3140 ix86_tune_string = "generic32";
3144 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3146 /* rep; movq isn't available in 32-bit code. */
3147 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3148 ix86_stringop_alg = no_stringop;
3151 if (!ix86_arch_string)
3152 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3153 else
3154 ix86_arch_specified = 1;
3156 if (global_options_set.x_ix86_pmode)
3158 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3159 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3160 error ("address mode %qs not supported in the %s bit mode",
3161 TARGET_64BIT ? "short" : "long",
3162 TARGET_64BIT ? "64" : "32");
3164 else
3165 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
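/* By default only the LP64 ABI uses 64-bit pointers (PMODE_DI);
   both x32 and 32-bit code default to 32-bit pointers (PMODE_SI).  */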
3167 if (!global_options_set.x_ix86_abi)
3168 ix86_abi = DEFAULT_ABI;
3170 if (global_options_set.x_ix86_cmodel)
3172 switch (ix86_cmodel)
3174 case CM_SMALL:
3175 case CM_SMALL_PIC:
3176 if (flag_pic)
3177 ix86_cmodel = CM_SMALL_PIC;
3178 if (!TARGET_64BIT)
3179 error ("code model %qs not supported in the %s bit mode",
3180 "small", "32");
3181 break;
3183 case CM_MEDIUM:
3184 case CM_MEDIUM_PIC:
3185 if (flag_pic)
3186 ix86_cmodel = CM_MEDIUM_PIC;
3187 if (!TARGET_64BIT)
3188 error ("code model %qs not supported in the %s bit mode",
3189 "medium", "32");
3190 else if (TARGET_X32)
3191 error ("code model %qs not supported in x32 mode",
3192 "medium");
3193 break;
3195 case CM_LARGE:
3196 case CM_LARGE_PIC:
3197 if (flag_pic)
3198 ix86_cmodel = CM_LARGE_PIC;
3199 if (!TARGET_64BIT)
3200 error ("code model %qs not supported in the %s bit mode",
3201 "large", "32");
3202 else if (TARGET_X32)
3203 error ("code model %qs not supported in x32 mode",
3204 "large");
3205 break;
3207 case CM_32:
3208 if (flag_pic)
3209 error ("code model %s does not support PIC mode", "32");
3210 if (TARGET_64BIT)
3211 error ("code model %qs not supported in the %s bit mode",
3212 "32", "64");
3213 break;
3215 case CM_KERNEL:
3216 if (flag_pic)
3218 error ("code model %s does not support PIC mode", "kernel");
3219 ix86_cmodel = CM_32;
3221 if (!TARGET_64BIT)
3222 error ("code model %qs not supported in the %s bit mode",
3223 "kernel", "32");
3224 break;
3226 default:
3227 gcc_unreachable ();
3230 else
3232 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3233 use of rip-relative addressing. This eliminates fixups that
3234 would otherwise be needed if this object is to be placed in a
3235 DLL, and is essentially just as efficient as direct addressing. */
3236 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3237 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3238 else if (TARGET_64BIT && TARGET_RDOS)
3239 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3240 else if (TARGET_64BIT)
3241 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3242 else
3243 ix86_cmodel = CM_32;
3245 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3247 error ("-masm=intel not supported in this configuration");
3248 ix86_asm_dialect = ASM_ATT;
3250 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3251 sorry ("%i-bit mode not compiled in",
3252 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3254 for (i = 0; i < pta_size; i++)
3255 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3257 ix86_schedule = processor_alias_table[i].schedule;
3258 ix86_arch = processor_alias_table[i].processor;
3259 /* Default cpu tuning to the architecture. */
3260 ix86_tune = ix86_arch;
3262 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3263 error ("CPU you selected does not support x86-64 "
3264 "instruction set");
3266 if (processor_alias_table[i].flags & PTA_MMX
3267 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3268 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3269 if (processor_alias_table[i].flags & PTA_3DNOW
3270 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3271 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3272 if (processor_alias_table[i].flags & PTA_3DNOW_A
3273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3274 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3275 if (processor_alias_table[i].flags & PTA_SSE
3276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3277 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3278 if (processor_alias_table[i].flags & PTA_SSE2
3279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3280 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3281 if (processor_alias_table[i].flags & PTA_SSE3
3282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3283 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3284 if (processor_alias_table[i].flags & PTA_SSSE3
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3286 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3287 if (processor_alias_table[i].flags & PTA_SSE4_1
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3289 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3290 if (processor_alias_table[i].flags & PTA_SSE4_2
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3292 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3293 if (processor_alias_table[i].flags & PTA_AVX
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3295 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3296 if (processor_alias_table[i].flags & PTA_AVX2
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3298 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3299 if (processor_alias_table[i].flags & PTA_FMA
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3301 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3302 if (processor_alias_table[i].flags & PTA_SSE4A
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3304 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3305 if (processor_alias_table[i].flags & PTA_FMA4
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3307 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3308 if (processor_alias_table[i].flags & PTA_XOP
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3310 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3311 if (processor_alias_table[i].flags & PTA_LWP
3312 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3313 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3314 if (processor_alias_table[i].flags & PTA_ABM
3315 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3316 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3317 if (processor_alias_table[i].flags & PTA_BMI
3318 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3319 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3320 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3321 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3322 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3323 if (processor_alias_table[i].flags & PTA_TBM
3324 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3325 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3326 if (processor_alias_table[i].flags & PTA_BMI2
3327 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3328 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3329 if (processor_alias_table[i].flags & PTA_CX16
3330 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3331 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3332 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3333 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3334 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3335 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3336 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3337 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3338 if (processor_alias_table[i].flags & PTA_MOVBE
3339 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3340 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3341 if (processor_alias_table[i].flags & PTA_AES
3342 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3343 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3344 if (processor_alias_table[i].flags & PTA_PCLMUL
3345 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3346 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3347 if (processor_alias_table[i].flags & PTA_FSGSBASE
3348 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3349 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3350 if (processor_alias_table[i].flags & PTA_RDRND
3351 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3352 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3353 if (processor_alias_table[i].flags & PTA_F16C
3354 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3355 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3356 if (processor_alias_table[i].flags & PTA_RTM
3357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3358 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3359 if (processor_alias_table[i].flags & PTA_HLE
3360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3361 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3362 if (processor_alias_table[i].flags & PTA_PRFCHW
3363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3364 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3365 if (processor_alias_table[i].flags & PTA_RDSEED
3366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3367 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3368 if (processor_alias_table[i].flags & PTA_ADX
3369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3370 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3371 if (processor_alias_table[i].flags & PTA_FXSR
3372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3373 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3374 if (processor_alias_table[i].flags & PTA_XSAVE
3375 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3376 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3377 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3378 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3379 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3380 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3381 x86_prefetch_sse = true;
3383 break;
3386 if (!strcmp (ix86_arch_string, "generic"))
3387 error ("generic CPU can be used only for %stune=%s %s",
3388 prefix, suffix, sw);
3389 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3390 error ("bad value (%s) for %sarch=%s %s",
3391 ix86_arch_string, prefix, suffix, sw);
3393 ix86_arch_mask = 1u << ix86_arch;
3394 for (i = 0; i < X86_ARCH_LAST; ++i)
3395 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3397 for (i = 0; i < pta_size; i++)
3398 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3400 ix86_schedule = processor_alias_table[i].schedule;
3401 ix86_tune = processor_alias_table[i].processor;
3402 if (TARGET_64BIT)
3404 if (!(processor_alias_table[i].flags & PTA_64BIT))
3406 if (ix86_tune_defaulted)
3408 ix86_tune_string = "x86-64";
3409 for (i = 0; i < pta_size; i++)
3410 if (! strcmp (ix86_tune_string,
3411 processor_alias_table[i].name))
3412 break;
3413 ix86_schedule = processor_alias_table[i].schedule;
3414 ix86_tune = processor_alias_table[i].processor;
3416 else
3417 error ("CPU you selected does not support x86-64 "
3418 "instruction set");
3421 else
3423 /* Adjust tuning when compiling for 32-bit ABI. */
3424 switch (ix86_tune)
3426 case PROCESSOR_GENERIC64:
3427 ix86_tune = PROCESSOR_GENERIC32;
3428 ix86_schedule = CPU_PENTIUMPRO;
3429 break;
3431 default:
3432 break;
3435 /* Intel CPUs have always interpreted SSE prefetch instructions as
3436 NOPs; so, we can enable SSE prefetch instructions even when
3437 -mtune (rather than -march) points us to a processor that has them.
3438 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3439 higher processors. */
3440 if (TARGET_CMOV
3441 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3442 x86_prefetch_sse = true;
3443 break;
3446 if (ix86_tune_specified && i == pta_size)
3447 error ("bad value (%s) for %stune=%s %s",
3448 ix86_tune_string, prefix, suffix, sw);
3450 ix86_tune_mask = 1u << ix86_tune;
3451 for (i = 0; i < X86_TUNE_LAST; ++i)
3452 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3454 #ifndef USE_IX86_FRAME_POINTER
3455 #define USE_IX86_FRAME_POINTER 0
3456 #endif
3458 #ifndef USE_X86_64_FRAME_POINTER
3459 #define USE_X86_64_FRAME_POINTER 0
3460 #endif
3462 /* Set the default values for switches whose default depends on TARGET_64BIT
3463 in case they weren't overwritten by command line options. */
3464 if (TARGET_64BIT)
3466 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3467 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3468 if (flag_asynchronous_unwind_tables == 2)
3469 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3470 if (flag_pcc_struct_return == 2)
3471 flag_pcc_struct_return = 0;
3473 else
3475 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3476 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3477 if (flag_asynchronous_unwind_tables == 2)
3478 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3479 if (flag_pcc_struct_return == 2)
3480 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3483 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3484 if (optimize_size)
3485 ix86_cost = &ix86_size_cost;
3486 else
3487 ix86_cost = ix86_tune_cost;
3489 /* Arrange to set up i386_stack_locals for all functions. */
3490 init_machine_status = ix86_init_machine_status;
3492 /* Validate -mregparm= value. */
3493 if (global_options_set.x_ix86_regparm)
3495 if (TARGET_64BIT)
3496 warning (0, "-mregparm is ignored in 64-bit mode");
3497 if (ix86_regparm > REGPARM_MAX)
3499 error ("-mregparm=%d is not between 0 and %d",
3500 ix86_regparm, REGPARM_MAX);
3501 ix86_regparm = 0;
3504 if (TARGET_64BIT)
3505 ix86_regparm = REGPARM_MAX;
3507 /* Default align_* from the processor table. */
3508 if (align_loops == 0)
3510 align_loops = processor_target_table[ix86_tune].align_loop;
3511 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3513 if (align_jumps == 0)
3515 align_jumps = processor_target_table[ix86_tune].align_jump;
3516 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3518 if (align_functions == 0)
3520 align_functions = processor_target_table[ix86_tune].align_func;
3523 /* Provide default for -mbranch-cost= value. */
3524 if (!global_options_set.x_ix86_branch_cost)
3525 ix86_branch_cost = ix86_cost->branch_cost;
3527 if (TARGET_64BIT)
3529 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3531 /* Enable by default the SSE and MMX builtins. Do allow the user to
3532 explicitly disable any of these. In particular, disabling SSE and
3533 MMX for kernel code is extremely useful. */
3534 if (!ix86_arch_specified)
3535 ix86_isa_flags
3536 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3537 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3539 if (TARGET_RTD)
3540 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3542 else
3544 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3546 if (!ix86_arch_specified)
3547 ix86_isa_flags
3548 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3550 /* The i386 ABI does not specify a red zone.  It still makes sense to use
3551 one when the programmer takes care to keep the stack from being destroyed. */
3552 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3553 target_flags |= MASK_NO_RED_ZONE;
3556 /* Keep nonleaf frame pointers. */
3557 if (flag_omit_frame_pointer)
3558 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3559 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3560 flag_omit_frame_pointer = 1;
3562 /* If we're doing fast math, we don't care about comparison order
3563 wrt NaNs. This lets us use a shorter comparison sequence. */
3564 if (flag_finite_math_only)
3565 target_flags &= ~MASK_IEEE_FP;
3567 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3568 since the insns won't need emulation. */
3569 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3570 target_flags &= ~MASK_NO_FANCY_MATH_387;
3572 /* Likewise, if the target doesn't have a 387, or we've specified
3573 software floating point, don't use 387 inline intrinsics. */
3574 if (!TARGET_80387)
3575 target_flags |= MASK_NO_FANCY_MATH_387;
3577 /* Turn on MMX builtins for -msse. */
3578 if (TARGET_SSE)
3579 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3581 /* Enable SSE prefetch. */
3582 if (TARGET_SSE || TARGET_PRFCHW)
3583 x86_prefetch_sse = true;
3585 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3586 if (TARGET_SSE4_2 || TARGET_ABM)
3587 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3589 /* Turn on lzcnt instruction for -mabm. */
3590 if (TARGET_ABM)
3591 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3593 /* Validate -mpreferred-stack-boundary= value or default it to
3594 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3595 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3596 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3598 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3599 int max = (TARGET_SEH ? 4 : 12);
3601 if (ix86_preferred_stack_boundary_arg < min
3602 || ix86_preferred_stack_boundary_arg > max)
3604 if (min == max)
3605 error ("-mpreferred-stack-boundary is not supported "
3606 "for this target");
3607 else
3608 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3609 ix86_preferred_stack_boundary_arg, min, max);
3611 else
3612 ix86_preferred_stack_boundary
3613 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
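/* The argument is the log2 of the boundary in bytes, so e.g.
   -mpreferred-stack-boundary=4 yields (1 << 4) * 8 = 128 bits,
   i.e. a 16-byte aligned stack.  */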
3616 /* Set the default value for -mstackrealign. */
3617 if (ix86_force_align_arg_pointer == -1)
3618 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3620 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3622 /* Validate -mincoming-stack-boundary= value or default it to
3623 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3624 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3625 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3627 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3628 || ix86_incoming_stack_boundary_arg > 12)
3629 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3630 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3631 else
3633 ix86_user_incoming_stack_boundary
3634 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3635 ix86_incoming_stack_boundary
3636 = ix86_user_incoming_stack_boundary;
3640 /* Accept -msseregparm only if at least SSE support is enabled. */
3641 if (TARGET_SSEREGPARM
3642 && ! TARGET_SSE)
3643 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3645 if (global_options_set.x_ix86_fpmath)
3647 if (ix86_fpmath & FPMATH_SSE)
3649 if (!TARGET_SSE)
3651 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3652 ix86_fpmath = FPMATH_387;
3654 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3656 warning (0, "387 instruction set disabled, using SSE arithmetics");
3657 ix86_fpmath = FPMATH_SSE;
3661 else
3662 ix86_fpmath = TARGET_FPMATH_DEFAULT;
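/* In short: -mfpmath=sse without SSE falls back to 387 arithmetic and
   -mfpmath=387 without an 80387 falls back to SSE, each with a warning;
   if -mfpmath= was not given at all, the target default is used.  */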
3664 /* If the i387 is disabled, then do not return values in it. */
3665 if (!TARGET_80387)
3666 target_flags &= ~MASK_FLOAT_RETURNS;
3668 /* Use external vectorized library in vectorizing intrinsics. */
3669 if (global_options_set.x_ix86_veclibabi_type)
3670 switch (ix86_veclibabi_type)
3672 case ix86_veclibabi_type_svml:
3673 ix86_veclib_handler = ix86_veclibabi_svml;
3674 break;
3676 case ix86_veclibabi_type_acml:
3677 ix86_veclib_handler = ix86_veclibabi_acml;
3678 break;
3680 default:
3681 gcc_unreachable ();
3684 if ((!USE_IX86_FRAME_POINTER
3685 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3686 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3687 && !optimize_size)
3688 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 /* ??? Unwind info is not correct around the CFG unless either a frame
3691 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3692 unwind info generation to be aware of the CFG and propagating states
3693 around edges. */
3694 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3695 || flag_exceptions || flag_non_call_exceptions)
3696 && flag_omit_frame_pointer
3697 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3699 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3700 warning (0, "unwind tables currently require either a frame pointer "
3701 "or %saccumulate-outgoing-args%s for correctness",
3702 prefix, suffix);
3703 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3706 /* If stack probes are required, the space used for large function
3707 arguments on the stack must also be probed, so enable
3708 -maccumulate-outgoing-args so this happens in the prologue. */
3709 if (TARGET_STACK_PROBE
3710 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3712 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3713 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3714 "for correctness", prefix, suffix);
3715 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3718 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3720 char *p;
3721 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3722 p = strchr (internal_label_prefix, 'X');
3723 internal_label_prefix_len = p - internal_label_prefix;
3724 *p = '\0';
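/* Whatever the target emits in front of the literal "LX" (typically the
   local label prefix) is recorded here, so internally generated labels
   can later be recognized by this prefix.  */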
3727 /* When a scheduling description is not available, disable the scheduler
3728 passes so they won't slow down the compilation and make x87 code slower. */
3729 if (!TARGET_SCHEDULE)
3730 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3732 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3733 ix86_tune_cost->simultaneous_prefetches,
3734 global_options.x_param_values,
3735 global_options_set.x_param_values);
3736 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3737 ix86_tune_cost->prefetch_block,
3738 global_options.x_param_values,
3739 global_options_set.x_param_values);
3740 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3741 ix86_tune_cost->l1_cache_size,
3742 global_options.x_param_values,
3743 global_options_set.x_param_values);
3744 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3745 ix86_tune_cost->l2_cache_size,
3746 global_options.x_param_values,
3747 global_options_set.x_param_values);
3749 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3750 if (flag_prefetch_loop_arrays < 0
3751 && HAVE_prefetch
3752 && (optimize >= 3 || flag_profile_use)
3753 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3754 flag_prefetch_loop_arrays = 1;
3756 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3757 can be optimized to ap = __builtin_next_arg (0). */
3758 if (!TARGET_64BIT && !flag_split_stack)
3759 targetm.expand_builtin_va_start = NULL;
3761 if (TARGET_64BIT)
3763 ix86_gen_leave = gen_leave_rex64;
3764 if (Pmode == DImode)
3766 ix86_gen_monitor = gen_sse3_monitor64_di;
3767 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3768 ix86_gen_tls_local_dynamic_base_64
3769 = gen_tls_local_dynamic_base_64_di;
3771 else
3773 ix86_gen_monitor = gen_sse3_monitor64_si;
3774 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3775 ix86_gen_tls_local_dynamic_base_64
3776 = gen_tls_local_dynamic_base_64_si;
3779 else
3781 ix86_gen_leave = gen_leave;
3782 ix86_gen_monitor = gen_sse3_monitor;
3785 if (Pmode == DImode)
3787 ix86_gen_add3 = gen_adddi3;
3788 ix86_gen_sub3 = gen_subdi3;
3789 ix86_gen_sub3_carry = gen_subdi3_carry;
3790 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3791 ix86_gen_andsp = gen_anddi3;
3792 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3793 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3794 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3796 else
3798 ix86_gen_add3 = gen_addsi3;
3799 ix86_gen_sub3 = gen_subsi3;
3800 ix86_gen_sub3_carry = gen_subsi3_carry;
3801 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3802 ix86_gen_andsp = gen_andsi3;
3803 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3804 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3805 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3808 #ifdef USE_IX86_CLD
3809 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3810 if (!TARGET_64BIT)
3811 target_flags |= MASK_CLD & ~target_flags_explicit;
3812 #endif
3814 if (!TARGET_64BIT && flag_pic)
3816 if (flag_fentry > 0)
3817 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3818 "with -fpic");
3819 flag_fentry = 0;
3821 else if (TARGET_SEH)
3823 if (flag_fentry == 0)
3824 sorry ("-mno-fentry isn%'t compatible with SEH");
3825 flag_fentry = 1;
3827 else if (flag_fentry < 0)
3829 #if defined(PROFILE_BEFORE_PROLOGUE)
3830 flag_fentry = 1;
3831 #else
3832 flag_fentry = 0;
3833 #endif
3836 if (TARGET_AVX)
3838 /* When not optimizing for size, enable the vzeroupper optimization for
3839 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3840 AVX unaligned loads/stores. */
3841 if (!optimize_size)
3843 if (flag_expensive_optimizations
3844 && !(target_flags_explicit & MASK_VZEROUPPER))
3845 target_flags |= MASK_VZEROUPPER;
3846 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3847 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3848 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3849 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3850 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3851 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3852 /* Enable 128-bit AVX instruction generation
3853 for the auto-vectorizer. */
3854 if (TARGET_AVX128_OPTIMAL
3855 && !(target_flags_explicit & MASK_PREFER_AVX128))
3856 target_flags |= MASK_PREFER_AVX128;
3859 else
3861 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3862 target_flags &= ~MASK_VZEROUPPER;
3865 if (ix86_recip_name)
3867 char *p = ASTRDUP (ix86_recip_name);
3868 char *q;
3869 unsigned int mask, i;
3870 bool invert;
3872 while ((q = strtok (p, ",")) != NULL)
3874 p = NULL;
3875 if (*q == '!')
3877 invert = true;
3878 q++;
3880 else
3881 invert = false;
3883 if (!strcmp (q, "default"))
3884 mask = RECIP_MASK_ALL;
3885 else
3887 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3888 if (!strcmp (q, recip_options[i].string))
3890 mask = recip_options[i].mask;
3891 break;
3894 if (i == ARRAY_SIZE (recip_options))
3896 error ("unknown option for -mrecip=%s", q);
3897 invert = false;
3898 mask = RECIP_MASK_NONE;
3902 recip_mask_explicit |= mask;
3903 if (invert)
3904 recip_mask &= ~mask;
3905 else
3906 recip_mask |= mask;
3910 if (TARGET_RECIP)
3911 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3912 else if (target_flags_explicit & MASK_RECIP)
3913 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
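/* So, for example, -mrecip=all,!sqrt enables every reciprocal
   approximation except the scalar square root, and a plain -mrecip
   turns on everything that -mrecip= did not configure explicitly.  */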
3915 /* Default long double to 64-bit for Bionic. */
3916 if (TARGET_HAS_BIONIC
3917 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3918 target_flags |= MASK_LONG_DOUBLE_64;
3920 /* Save the initial options in case the user specifies function specific
3921 options. */
3922 if (main_args_p)
3923 target_option_default_node = target_option_current_node
3924 = build_target_option_node ();
3927 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3929 static void
3930 ix86_option_override (void)
3932 static struct register_pass_info insert_vzeroupper_info
3933 = { &pass_insert_vzeroupper.pass, "reload",
3934 1, PASS_POS_INSERT_AFTER
3937 ix86_option_override_internal (true);
3940 /* This needs to be done at startup.  It's convenient to do it here. */
3941 register_pass (&insert_vzeroupper_info);
3944 /* Update register usage after having seen the compiler flags. */
3946 static void
3947 ix86_conditional_register_usage (void)
3949 int i, c_mask;
3950 unsigned int j;
3952 /* The PIC register, if it exists, is fixed. */
3953 j = PIC_OFFSET_TABLE_REGNUM;
3954 if (j != INVALID_REGNUM)
3955 fixed_regs[j] = call_used_regs[j] = 1;
3957 /* For 32-bit targets, squash the REX registers. */
3958 if (! TARGET_64BIT)
3960 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3961 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3962 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3963 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3966 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3967 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3968 : TARGET_64BIT ? (1 << 2)
3969 : (1 << 1));
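/* c_mask picks the conditional column of the CALL_USED_REGISTERS
   initializer that applies here: bit 3 for the 64-bit MS ABI, bit 2 for
   64-bit SysV, and bit 1 for 32-bit targets.  */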
3971 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3973 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 /* Set/reset conditionally defined registers from
3976 CALL_USED_REGISTERS initializer. */
3977 if (call_used_regs[i] > 1)
3978 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3980 /* Calculate registers of CLOBBERED_REGS register set
3981 as call used registers from GENERAL_REGS register set. */
3982 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3983 && call_used_regs[i])
3984 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3987 /* If MMX is disabled, squash the registers. */
3988 if (! TARGET_MMX)
3989 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3990 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3993 /* If SSE is disabled, squash the registers. */
3994 if (! TARGET_SSE)
3995 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3996 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3997 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3999 /* If the FPU is disabled, squash the registers. */
4000 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4001 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4002 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4003 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4007 /* Save the current options */
4009 static void
4010 ix86_function_specific_save (struct cl_target_option *ptr)
4012 ptr->arch = ix86_arch;
4013 ptr->schedule = ix86_schedule;
4014 ptr->tune = ix86_tune;
4015 ptr->branch_cost = ix86_branch_cost;
4016 ptr->tune_defaulted = ix86_tune_defaulted;
4017 ptr->arch_specified = ix86_arch_specified;
4018 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4019 ptr->ix86_target_flags_explicit = target_flags_explicit;
4020 ptr->x_recip_mask_explicit = recip_mask_explicit;
4022 /* The fields are char but the variables are not; make sure the
4023 values fit in the fields. */
4024 gcc_assert (ptr->arch == ix86_arch);
4025 gcc_assert (ptr->schedule == ix86_schedule);
4026 gcc_assert (ptr->tune == ix86_tune);
4027 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4030 /* Restore the current options */
4032 static void
4033 ix86_function_specific_restore (struct cl_target_option *ptr)
4035 enum processor_type old_tune = ix86_tune;
4036 enum processor_type old_arch = ix86_arch;
4037 unsigned int ix86_arch_mask, ix86_tune_mask;
4038 int i;
4040 ix86_arch = (enum processor_type) ptr->arch;
4041 ix86_schedule = (enum attr_cpu) ptr->schedule;
4042 ix86_tune = (enum processor_type) ptr->tune;
4043 ix86_branch_cost = ptr->branch_cost;
4044 ix86_tune_defaulted = ptr->tune_defaulted;
4045 ix86_arch_specified = ptr->arch_specified;
4046 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4047 target_flags_explicit = ptr->ix86_target_flags_explicit;
4048 recip_mask_explicit = ptr->x_recip_mask_explicit;
4050 /* Recreate the arch feature tests if the arch changed */
4051 if (old_arch != ix86_arch)
4053 ix86_arch_mask = 1u << ix86_arch;
4054 for (i = 0; i < X86_ARCH_LAST; ++i)
4055 ix86_arch_features[i]
4056 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4059 /* Recreate the tune optimization tests */
4060 if (old_tune != ix86_tune)
4062 ix86_tune_mask = 1u << ix86_tune;
4063 for (i = 0; i < X86_TUNE_LAST; ++i)
4064 ix86_tune_features[i]
4065 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4069 /* Print the current options */
4071 static void
4072 ix86_function_specific_print (FILE *file, int indent,
4073 struct cl_target_option *ptr)
4075 char *target_string
4076 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4077 NULL, NULL, ptr->x_ix86_fpmath, false);
4079 fprintf (file, "%*sarch = %d (%s)\n",
4080 indent, "",
4081 ptr->arch,
4082 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4083 ? cpu_names[ptr->arch]
4084 : "<unknown>"));
4086 fprintf (file, "%*stune = %d (%s)\n",
4087 indent, "",
4088 ptr->tune,
4089 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4090 ? cpu_names[ptr->tune]
4091 : "<unknown>"));
4093 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4095 if (target_string)
4097 fprintf (file, "%*s%s\n", indent, "", target_string);
4098 free (target_string);
4103 /* Inner function to process the attribute((target(...))), take an argument and
4104 set the current options from the argument. If we have a list, recursively go
4105 over the list. */
4107 static bool
4108 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4109 struct gcc_options *enum_opts_set)
4111 char *next_optstr;
4112 bool ret = true;
4114 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4115 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4116 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4117 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4118 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4120 enum ix86_opt_type
4122 ix86_opt_unknown,
4123 ix86_opt_yes,
4124 ix86_opt_no,
4125 ix86_opt_str,
4126 ix86_opt_enum,
4127 ix86_opt_isa
4130 static const struct
4132 const char *string;
4133 size_t len;
4134 enum ix86_opt_type type;
4135 int opt;
4136 int mask;
4137 } attrs[] = {
4138 /* isa options */
4139 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4140 IX86_ATTR_ISA ("abm", OPT_mabm),
4141 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4142 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4143 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4144 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4145 IX86_ATTR_ISA ("aes", OPT_maes),
4146 IX86_ATTR_ISA ("avx", OPT_mavx),
4147 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4148 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4149 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4150 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4151 IX86_ATTR_ISA ("sse", OPT_msse),
4152 IX86_ATTR_ISA ("sse2", OPT_msse2),
4153 IX86_ATTR_ISA ("sse3", OPT_msse3),
4154 IX86_ATTR_ISA ("sse4", OPT_msse4),
4155 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4156 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4157 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4158 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4159 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4160 IX86_ATTR_ISA ("fma", OPT_mfma),
4161 IX86_ATTR_ISA ("xop", OPT_mxop),
4162 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4163 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4164 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4165 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4166 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4167 IX86_ATTR_ISA ("hle", OPT_mhle),
4168 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4169 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4170 IX86_ATTR_ISA ("adx", OPT_madx),
4171 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4172 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4173 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4175 /* enum options */
4176 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4178 /* string options */
4179 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4180 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4182 /* flag options */
4183 IX86_ATTR_YES ("cld",
4184 OPT_mcld,
4185 MASK_CLD),
4187 IX86_ATTR_NO ("fancy-math-387",
4188 OPT_mfancy_math_387,
4189 MASK_NO_FANCY_MATH_387),
4191 IX86_ATTR_YES ("ieee-fp",
4192 OPT_mieee_fp,
4193 MASK_IEEE_FP),
4195 IX86_ATTR_YES ("inline-all-stringops",
4196 OPT_minline_all_stringops,
4197 MASK_INLINE_ALL_STRINGOPS),
4199 IX86_ATTR_YES ("inline-stringops-dynamically",
4200 OPT_minline_stringops_dynamically,
4201 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4203 IX86_ATTR_NO ("align-stringops",
4204 OPT_mno_align_stringops,
4205 MASK_NO_ALIGN_STRINGOPS),
4207 IX86_ATTR_YES ("recip",
4208 OPT_mrecip,
4209 MASK_RECIP),
4213 /* If this is a list, recurse to get the options. */
4214 if (TREE_CODE (args) == TREE_LIST)
4216 bool ret = true;
4218 for (; args; args = TREE_CHAIN (args))
4219 if (TREE_VALUE (args)
4220 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4221 p_strings, enum_opts_set))
4222 ret = false;
4224 return ret;
4227 else if (TREE_CODE (args) != STRING_CST)
4229 error ("attribute %<target%> argument not a string");
4230 return false;
4233 /* Handle multiple arguments separated by commas. */
4234 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4236 while (next_optstr && *next_optstr != '\0')
4238 char *p = next_optstr;
4239 char *orig_p = p;
4240 char *comma = strchr (next_optstr, ',');
4241 const char *opt_string;
4242 size_t len, opt_len;
4243 int opt;
4244 bool opt_set_p;
4245 char ch;
4246 unsigned i;
4247 enum ix86_opt_type type = ix86_opt_unknown;
4248 int mask = 0;
4250 if (comma)
4252 *comma = '\0';
4253 len = comma - next_optstr;
4254 next_optstr = comma + 1;
4256 else
4258 len = strlen (p);
4259 next_optstr = NULL;
4262 /* Recognize no-xxx. */
4263 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4265 opt_set_p = false;
4266 p += 3;
4267 len -= 3;
4269 else
4270 opt_set_p = true;
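/* E.g. "no-avx" clears opt_set_p and strips the "no-" prefix so the
   table lookup below matches the plain "avx" entry.  */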
4272 /* Find the option. */
4273 ch = *p;
4274 opt = N_OPTS;
4275 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4277 type = attrs[i].type;
4278 opt_len = attrs[i].len;
4279 if (ch == attrs[i].string[0]
4280 && ((type != ix86_opt_str && type != ix86_opt_enum)
4281 ? len == opt_len
4282 : len > opt_len)
4283 && memcmp (p, attrs[i].string, opt_len) == 0)
4285 opt = attrs[i].opt;
4286 mask = attrs[i].mask;
4287 opt_string = attrs[i].string;
4288 break;
4292 /* Process the option. */
4293 if (opt == N_OPTS)
4295 error ("attribute(target(\"%s\")) is unknown", orig_p);
4296 ret = false;
4299 else if (type == ix86_opt_isa)
4301 struct cl_decoded_option decoded;
4303 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4304 ix86_handle_option (&global_options, &global_options_set,
4305 &decoded, input_location);
4308 else if (type == ix86_opt_yes || type == ix86_opt_no)
4310 if (type == ix86_opt_no)
4311 opt_set_p = !opt_set_p;
4313 if (opt_set_p)
4314 target_flags |= mask;
4315 else
4316 target_flags &= ~mask;
4319 else if (type == ix86_opt_str)
4321 if (p_strings[opt])
4323 error ("option(\"%s\") was already specified", opt_string);
4324 ret = false;
4326 else
4327 p_strings[opt] = xstrdup (p + opt_len);
4330 else if (type == ix86_opt_enum)
4332 bool arg_ok;
4333 int value;
4335 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4336 if (arg_ok)
4337 set_option (&global_options, enum_opts_set, opt, value,
4338 p + opt_len, DK_UNSPECIFIED, input_location,
4339 global_dc);
4340 else
4342 error ("attribute(target(\"%s\")) is unknown", orig_p);
4343 ret = false;
4347 else
4348 gcc_unreachable ();
4351 return ret;
4354 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4356 tree
4357 ix86_valid_target_attribute_tree (tree args)
4359 const char *orig_arch_string = ix86_arch_string;
4360 const char *orig_tune_string = ix86_tune_string;
4361 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4362 int orig_tune_defaulted = ix86_tune_defaulted;
4363 int orig_arch_specified = ix86_arch_specified;
4364 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4365 tree t = NULL_TREE;
4366 int i;
4367 struct cl_target_option *def
4368 = TREE_TARGET_OPTION (target_option_default_node);
4369 struct gcc_options enum_opts_set;
4371 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4373 /* Process each of the options on the chain. */
4374 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4375 &enum_opts_set))
4376 return error_mark_node;
4378 /* If the changed options are different from the default, rerun
4379 ix86_option_override_internal, and then save the options away.
4380 The string options are attribute options, and will be undone
4381 when we copy the save structure. */
4382 if (ix86_isa_flags != def->x_ix86_isa_flags
4383 || target_flags != def->x_target_flags
4384 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4385 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4386 || enum_opts_set.x_ix86_fpmath)
4388 /* If we are using the default tune= or arch=, undo the string assigned,
4389 and use the default. */
4390 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4391 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4392 else if (!orig_arch_specified)
4393 ix86_arch_string = NULL;
4395 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4396 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4397 else if (orig_tune_defaulted)
4398 ix86_tune_string = NULL;
4400 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4401 if (enum_opts_set.x_ix86_fpmath)
4402 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4403 else if (!TARGET_64BIT && TARGET_SSE)
4405 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4406 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4409 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4410 ix86_option_override_internal (false);
4412 /* Add any builtin functions with the new isa if any. */
4413 ix86_add_new_builtins (ix86_isa_flags);
4415 /* Save the current options unless we are validating options for
4416 #pragma. */
4417 t = build_target_option_node ();
4419 ix86_arch_string = orig_arch_string;
4420 ix86_tune_string = orig_tune_string;
4421 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4423 /* Free up memory allocated to hold the strings */
4424 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4425 free (option_strings[i]);
4428 return t;
4431 /* Hook to validate attribute((target("string"))). */
4433 static bool
4434 ix86_valid_target_attribute_p (tree fndecl,
4435 tree ARG_UNUSED (name),
4436 tree args,
4437 int ARG_UNUSED (flags))
4439 struct cl_target_option cur_target;
4440 bool ret = true;
4442 /* attribute((target("default"))) does nothing, beyond
4443 affecting multi-versioning. */
4444 if (TREE_VALUE (args)
4445 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4446 && TREE_CHAIN (args) == NULL_TREE
4447 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4448 return true;
4450 tree old_optimize = build_optimization_node ();
4451 tree new_target, new_optimize;
4452 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4454 /* If the function changed the optimization levels as well as setting target
4455 options, start with the optimizations specified. */
4456 if (func_optimize && func_optimize != old_optimize)
4457 cl_optimization_restore (&global_options,
4458 TREE_OPTIMIZATION (func_optimize));
4460 /* The target attributes may also change some optimization flags, so update
4461 the optimization options if necessary. */
4462 cl_target_option_save (&cur_target, &global_options);
4463 new_target = ix86_valid_target_attribute_tree (args);
4464 new_optimize = build_optimization_node ();
4466 if (new_target == error_mark_node)
4467 ret = false;
4469 else if (fndecl && new_target)
4471 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4473 if (old_optimize != new_optimize)
4474 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4477 cl_target_option_restore (&global_options, &cur_target);
4479 if (old_optimize != new_optimize)
4480 cl_optimization_restore (&global_options,
4481 TREE_OPTIMIZATION (old_optimize));
4483 return ret;
4487 /* Hook to determine if one function can safely inline another. */
4489 static bool
4490 ix86_can_inline_p (tree caller, tree callee)
4492 bool ret = false;
4493 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4494 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4496 /* If callee has no option attributes, then it is ok to inline. */
4497 if (!callee_tree)
4498 ret = true;
4500 /* If caller has no option attributes, but callee does then it is not ok to
4501 inline. */
4502 else if (!caller_tree)
4503 ret = false;
4505 else
4507 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4508 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4510 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4511 function can inline an SSE2 function but an SSE2 function can't inline
4512 an SSE4 function. */
4513 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4514 != callee_opts->x_ix86_isa_flags)
4515 ret = false;
4517 /* See if we have the same non-isa options. */
4518 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4519 ret = false;
4521 /* See if arch, tune, etc. are the same. */
4522 else if (caller_opts->arch != callee_opts->arch)
4523 ret = false;
4525 else if (caller_opts->tune != callee_opts->tune)
4526 ret = false;
4528 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4529 ret = false;
4531 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4532 ret = false;
4534 else
4535 ret = true;
4538 return ret;
4542 /* Remember the last target of ix86_set_current_function. */
4543 static GTY(()) tree ix86_previous_fndecl;
4545 /* Establish appropriate back-end context for processing the function
4546 FNDECL. The argument might be NULL to indicate processing at top
4547 level, outside of any function scope. */
4548 static void
4549 ix86_set_current_function (tree fndecl)
4551 /* Only change the context if the function changes. This hook is called
4552 several times in the course of compiling a function, and we don't want to
4553 slow things down too much or call target_reinit when it isn't safe. */
4554 if (fndecl && fndecl != ix86_previous_fndecl)
4556 tree old_tree = (ix86_previous_fndecl
4557 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4558 : NULL_TREE);
4560 tree new_tree = (fndecl
4561 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4562 : NULL_TREE);
4564 ix86_previous_fndecl = fndecl;
4565 if (old_tree == new_tree)
4568 else if (new_tree)
4570 cl_target_option_restore (&global_options,
4571 TREE_TARGET_OPTION (new_tree));
4572 target_reinit ();
4575 else if (old_tree)
4577 struct cl_target_option *def
4578 = TREE_TARGET_OPTION (target_option_current_node);
4580 cl_target_option_restore (&global_options, def);
4581 target_reinit ();
4587 /* Return true if this goes in large data/bss. */
4589 static bool
4590 ix86_in_large_data_p (tree exp)
4592 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4593 return false;
4595 /* Functions are never large data. */
4596 if (TREE_CODE (exp) == FUNCTION_DECL)
4597 return false;
4599 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4601 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4602 if (strcmp (section, ".ldata") == 0
4603 || strcmp (section, ".lbss") == 0)
4604 return true;
4605 return false;
4607 else
4609 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4611 /* If this is an incomplete type with size 0, then we can't put it
4612 in data because it might be too big when completed. */
4613 if (!size || size > ix86_section_threshold)
4614 return true;
4617 return false;
4620 /* Switch to the appropriate section for output of DECL.
4621 DECL is either a `VAR_DECL' node or a constant of some sort.
4622 RELOC indicates whether forming the initial value of DECL requires
4623 link-time relocations. */
4625 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4626 ATTRIBUTE_UNUSED;
4628 static section *
4629 x86_64_elf_select_section (tree decl, int reloc,
4630 unsigned HOST_WIDE_INT align)
4632 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4633 && ix86_in_large_data_p (decl))
4635 const char *sname = NULL;
4636 unsigned int flags = SECTION_WRITE;
4637 switch (categorize_decl_for_section (decl, reloc))
4639 case SECCAT_DATA:
4640 sname = ".ldata";
4641 break;
4642 case SECCAT_DATA_REL:
4643 sname = ".ldata.rel";
4644 break;
4645 case SECCAT_DATA_REL_LOCAL:
4646 sname = ".ldata.rel.local";
4647 break;
4648 case SECCAT_DATA_REL_RO:
4649 sname = ".ldata.rel.ro";
4650 break;
4651 case SECCAT_DATA_REL_RO_LOCAL:
4652 sname = ".ldata.rel.ro.local";
4653 break;
4654 case SECCAT_BSS:
4655 sname = ".lbss";
4656 flags |= SECTION_BSS;
4657 break;
4658 case SECCAT_RODATA:
4659 case SECCAT_RODATA_MERGE_STR:
4660 case SECCAT_RODATA_MERGE_STR_INIT:
4661 case SECCAT_RODATA_MERGE_CONST:
4662 sname = ".lrodata";
4663 flags = 0;
4664 break;
4665 case SECCAT_SRODATA:
4666 case SECCAT_SDATA:
4667 case SECCAT_SBSS:
4668 gcc_unreachable ();
4669 case SECCAT_TEXT:
4670 case SECCAT_TDATA:
4671 case SECCAT_TBSS:
4672 /* We don't split these for the medium model.  Place them into
4673 default sections and hope for the best. */
4674 break;
4676 if (sname)
4678 /* We might get called with string constants, but get_named_section
4679 doesn't like them as they are not DECLs. Also, we need to set
4680 flags in that case. */
4681 if (!DECL_P (decl))
4682 return get_section (sname, flags, NULL);
4683 return get_named_section (decl, sname, reloc);
4686 return default_elf_select_section (decl, reloc, align);
4689 /* Build up a unique section name, expressed as a
4690 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4691 RELOC indicates whether the initial value of EXP requires
4692 link-time relocations. */
4694 static void ATTRIBUTE_UNUSED
4695 x86_64_elf_unique_section (tree decl, int reloc)
4697 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4698 && ix86_in_large_data_p (decl))
4700 const char *prefix = NULL;
4701 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4702 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4704 switch (categorize_decl_for_section (decl, reloc))
4706 case SECCAT_DATA:
4707 case SECCAT_DATA_REL:
4708 case SECCAT_DATA_REL_LOCAL:
4709 case SECCAT_DATA_REL_RO:
4710 case SECCAT_DATA_REL_RO_LOCAL:
4711 prefix = one_only ? ".ld" : ".ldata";
4712 break;
4713 case SECCAT_BSS:
4714 prefix = one_only ? ".lb" : ".lbss";
4715 break;
4716 case SECCAT_RODATA:
4717 case SECCAT_RODATA_MERGE_STR:
4718 case SECCAT_RODATA_MERGE_STR_INIT:
4719 case SECCAT_RODATA_MERGE_CONST:
4720 prefix = one_only ? ".lr" : ".lrodata";
4721 break;
4722 case SECCAT_SRODATA:
4723 case SECCAT_SDATA:
4724 case SECCAT_SBSS:
4725 gcc_unreachable ();
4726 case SECCAT_TEXT:
4727 case SECCAT_TDATA:
4728 case SECCAT_TBSS:
4729 /* We don't split these for the medium model.  Place them into
4730 default sections and hope for the best. */
4731 break;
4733 if (prefix)
4735 const char *name, *linkonce;
4736 char *string;
4738 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4739 name = targetm.strip_name_encoding (name);
4741 /* If we're using one_only, then there needs to be a .gnu.linkonce
4742 prefix to the section name. */
4743 linkonce = one_only ? ".gnu.linkonce" : "";
4745 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4747 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4748 return;
4751 default_unique_section (decl, reloc);
4754 #ifdef COMMON_ASM_OP
4755 /* This says how to output assembler code to declare an
4756 uninitialized external linkage data object.
4758 For medium model x86-64 we need to use the .largecomm directive for
4759 large objects. */
4760 void
4761 x86_elf_aligned_common (FILE *file,
4762 const char *name, unsigned HOST_WIDE_INT size,
4763 int align)
4765 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4766 && size > (unsigned int)ix86_section_threshold)
4767 fputs (".largecomm\t", file);
4768 else
4769 fputs (COMMON_ASM_OP, file);
4770 assemble_name (file, name);
4771 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4772 size, align / BITS_PER_UNIT);
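/* So a medium-model object above ix86_section_threshold comes out as,
   e.g., ".largecomm\tfoo,4096,32" (size and alignment in bytes), while
   smaller objects use the ordinary COMMON_ASM_OP form.  */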
4774 #endif
4776 /* Utility function for targets to use in implementing
4777 ASM_OUTPUT_ALIGNED_BSS. */
4779 void
4780 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4781 const char *name, unsigned HOST_WIDE_INT size,
4782 int align)
4784 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4785 && size > (unsigned int)ix86_section_threshold)
4786 switch_to_section (get_named_section (decl, ".lbss", 0));
4787 else
4788 switch_to_section (bss_section);
4789 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4790 #ifdef ASM_DECLARE_OBJECT_NAME
4791 last_assemble_variable_decl = decl;
4792 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4793 #else
4794 /* Standard thing is just output label for the object. */
4795 ASM_OUTPUT_LABEL (file, name);
4796 #endif /* ASM_DECLARE_OBJECT_NAME */
4797 ASM_OUTPUT_SKIP (file, size ? size : 1);
4800 /* Decide whether we must probe the stack before any space allocation
4801 on this target. It's essentially TARGET_STACK_PROBE except when
4802 -fstack-check causes the stack to be already probed differently. */
4804 bool
4805 ix86_target_stack_probe (void)
4807 /* Do not probe the stack twice if static stack checking is enabled. */
4808 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4809 return false;
4811 return TARGET_STACK_PROBE;
4814 /* Decide whether we can make a sibling call to a function. DECL is the
4815 declaration of the function being targeted by the call and EXP is the
4816 CALL_EXPR representing the call. */
4818 static bool
4819 ix86_function_ok_for_sibcall (tree decl, tree exp)
4821 tree type, decl_or_type;
4822 rtx a, b;
4824 /* If we are generating position-independent code, we cannot sibcall
4825 optimize any indirect call, or a direct call to a global function,
4826 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4827 if (!TARGET_MACHO
4828 && !TARGET_64BIT
4829 && flag_pic
4830 && (!decl || !targetm.binds_local_p (decl)))
4831 return false;
4833 /* If we need to align the outgoing stack, then sibcalling would
4834 unalign the stack, which may break the called function. */
4835 if (ix86_minimum_incoming_stack_boundary (true)
4836 < PREFERRED_STACK_BOUNDARY)
4837 return false;
4839 if (decl)
4841 decl_or_type = decl;
4842 type = TREE_TYPE (decl);
4844 else
4846 /* We're looking at the CALL_EXPR, we need the type of the function. */
4847 type = CALL_EXPR_FN (exp); /* pointer expression */
4848 type = TREE_TYPE (type); /* pointer type */
4849 type = TREE_TYPE (type); /* function type */
4850 decl_or_type = type;
4853 /* Check that the return value locations are the same. For example,
4854 if we are returning floats on the 80387 register stack, we cannot
4855 make a sibcall from a function that doesn't return a float to a
4856 function that does or, conversely, from a function that does return
4857 a float to a function that doesn't; the necessary stack adjustment
4858 would not be executed. This is also the place we notice
4859 differences in the return value ABI. Note that it is ok for one
4860 of the functions to have void return type as long as the return
4861 value of the other is passed in a register. */
4862 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4863 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4864 cfun->decl, false);
4865 if (STACK_REG_P (a) || STACK_REG_P (b))
4867 if (!rtx_equal_p (a, b))
4868 return false;
4870 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4872 else if (!rtx_equal_p (a, b))
4873 return false;
4875 if (TARGET_64BIT)
4877 /* The SYSV ABI has more call-clobbered registers;
4878 disallow sibcalls from MS to SYSV. */
4879 if (cfun->machine->call_abi == MS_ABI
4880 && ix86_function_type_abi (type) == SYSV_ABI)
4881 return false;
4883 else
4885 /* If this call is indirect, we'll need to be able to use a
4886 call-clobbered register for the address of the target function.
4887 Make sure that all such registers are not used for passing
4888 parameters. Note that DLLIMPORT functions are indirect. */
4889 if (!decl
4890 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4892 if (ix86_function_regparm (type, NULL) >= 3)
4894 /* ??? Need to count the actual number of registers to be used,
4895 not the possible number of registers. Fix later. */
4896 return false;
4901 /* Otherwise okay. That also includes certain types of indirect calls. */
4902 return true;
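/* Illustrative example (not part of the original source): when compiling
   32-bit PIC code, a tail call such as

       extern int helper (int);
       int wrapper (int x) { return helper (x); }

   fails the first check above because helper does not bind locally and its
   PLT entry needs %ebx to hold the GOT pointer, so a normal call plus return
   is emitted instead of a sibcall jump.  */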
4905 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4906 and "sseregparm" calling convention attributes;
4907 arguments as in struct attribute_spec.handler. */
4909 static tree
4910 ix86_handle_cconv_attribute (tree *node, tree name,
4911 tree args,
4912 int flags ATTRIBUTE_UNUSED,
4913 bool *no_add_attrs)
4915 if (TREE_CODE (*node) != FUNCTION_TYPE
4916 && TREE_CODE (*node) != METHOD_TYPE
4917 && TREE_CODE (*node) != FIELD_DECL
4918 && TREE_CODE (*node) != TYPE_DECL)
4920 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4921 name);
4922 *no_add_attrs = true;
4923 return NULL_TREE;
4926 /* Can combine regparm with all attributes but fastcall, and thiscall. */
4927 if (is_attribute_p ("regparm", name))
4929 tree cst;
4931 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4933 error ("fastcall and regparm attributes are not compatible");
4936 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4938 error ("regparam and thiscall attributes are not compatible");
4941 cst = TREE_VALUE (args);
4942 if (TREE_CODE (cst) != INTEGER_CST)
4944 warning (OPT_Wattributes,
4945 "%qE attribute requires an integer constant argument",
4946 name);
4947 *no_add_attrs = true;
4949 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4951 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4952 name, REGPARM_MAX);
4953 *no_add_attrs = true;
4956 return NULL_TREE;
4959 if (TARGET_64BIT)
4961 /* Do not warn when emulating the MS ABI. */
4962 if ((TREE_CODE (*node) != FUNCTION_TYPE
4963 && TREE_CODE (*node) != METHOD_TYPE)
4964 || ix86_function_type_abi (*node) != MS_ABI)
4965 warning (OPT_Wattributes, "%qE attribute ignored",
4966 name);
4967 *no_add_attrs = true;
4968 return NULL_TREE;
4971 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4972 if (is_attribute_p ("fastcall", name))
4974 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4976 error ("fastcall and cdecl attributes are not compatible");
4978 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4980 error ("fastcall and stdcall attributes are not compatible");
4982 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4984 error ("fastcall and regparm attributes are not compatible");
4986 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4988 error ("fastcall and thiscall attributes are not compatible");
4992 /* Can combine stdcall with fastcall (redundant), regparm and
4993 sseregparm. */
4994 else if (is_attribute_p ("stdcall", name))
4996 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4998 error ("stdcall and cdecl attributes are not compatible");
5000 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5002 error ("stdcall and fastcall attributes are not compatible");
5004 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5006 error ("stdcall and thiscall attributes are not compatible");
5010 /* Can combine cdecl with regparm and sseregparm. */
5011 else if (is_attribute_p ("cdecl", name))
5013 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5015 error ("stdcall and cdecl attributes are not compatible");
5017 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 error ("fastcall and cdecl attributes are not compatible");
5021 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5023 error ("cdecl and thiscall attributes are not compatible");
5026 else if (is_attribute_p ("thiscall", name))
5028 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5029 warning (OPT_Wattributes, "%qE attribute is used for none class-method",
5030 name);
5031 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5033 error ("stdcall and thiscall attributes are not compatible");
5035 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5037 error ("fastcall and thiscall attributes are not compatible");
5039 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5041 error ("cdecl and thiscall attributes are not compatible");
5045 /* Can combine sseregparm with all attributes. */
5047 return NULL_TREE;
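/* Illustrative example (not part of the original source): the handler above
   accepts a declaration such as

       int __attribute__ ((regparm (3))) f (int a, int b, int c);

   but rejects conflicting combinations, e.g.

       int __attribute__ ((fastcall, regparm (2))) g (int a, int b);

   which triggers the "fastcall and regparm attributes are not compatible"
   diagnostic.  */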
5050 /* The transactional memory builtins are implicitly regparm or fastcall
5051 depending on the ABI. Override the generic do-nothing attribute that
5052 these builtins were declared with, and replace it with one of the two
5053 attributes that we expect elsewhere. */
5055 static tree
5056 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5057 tree args ATTRIBUTE_UNUSED,
5058 int flags ATTRIBUTE_UNUSED,
5059 bool *no_add_attrs)
5061 tree alt;
5063 /* In no case do we want to add the placeholder attribute. */
5064 *no_add_attrs = true;
5066 /* The 64-bit ABI is unchanged for transactional memory. */
5067 if (TARGET_64BIT)
5068 return NULL_TREE;
5070 /* ??? Is there a better way to validate 32-bit windows? We have
5071 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5072 if (CHECK_STACK_LIMIT > 0)
5073 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5074 else
5076 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5077 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5079 decl_attributes (node, alt, flags);
5081 return NULL_TREE;
5084 /* This function determines from TYPE the calling-convention. */
5086 unsigned int
5087 ix86_get_callcvt (const_tree type)
5089 unsigned int ret = 0;
5090 bool is_stdarg;
5091 tree attrs;
5093 if (TARGET_64BIT)
5094 return IX86_CALLCVT_CDECL;
5096 attrs = TYPE_ATTRIBUTES (type);
5097 if (attrs != NULL_TREE)
5099 if (lookup_attribute ("cdecl", attrs))
5100 ret |= IX86_CALLCVT_CDECL;
5101 else if (lookup_attribute ("stdcall", attrs))
5102 ret |= IX86_CALLCVT_STDCALL;
5103 else if (lookup_attribute ("fastcall", attrs))
5104 ret |= IX86_CALLCVT_FASTCALL;
5105 else if (lookup_attribute ("thiscall", attrs))
5106 ret |= IX86_CALLCVT_THISCALL;
5108 /* Regparm isn't allowed for thiscall and fastcall. */
5109 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5111 if (lookup_attribute ("regparm", attrs))
5112 ret |= IX86_CALLCVT_REGPARM;
5113 if (lookup_attribute ("sseregparm", attrs))
5114 ret |= IX86_CALLCVT_SSEREGPARM;
5117 if (IX86_BASE_CALLCVT(ret) != 0)
5118 return ret;
5121 is_stdarg = stdarg_p (type);
5122 if (TARGET_RTD && !is_stdarg)
5123 return IX86_CALLCVT_STDCALL | ret;
5125 if (ret != 0
5126 || is_stdarg
5127 || TREE_CODE (type) != METHOD_TYPE
5128 || ix86_function_type_abi (type) != MS_ABI)
5129 return IX86_CALLCVT_CDECL | ret;
5131 return IX86_CALLCVT_THISCALL;
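/* Illustrative example (not part of the original source): a plain 32-bit
   prototype with no convention attribute is reported as IX86_CALLCVT_CDECL
   by the code above, whereas the same prototype compiled with -mrtd
   (TARGET_RTD) and a fixed argument list is reported as
   IX86_CALLCVT_STDCALL.  */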
5134 /* Return 0 if the attributes for two types are incompatible, 1 if they
5135 are compatible, and 2 if they are nearly compatible (which causes a
5136 warning to be generated). */
5138 static int
5139 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5141 unsigned int ccvt1, ccvt2;
5143 if (TREE_CODE (type1) != FUNCTION_TYPE
5144 && TREE_CODE (type1) != METHOD_TYPE)
5145 return 1;
5147 ccvt1 = ix86_get_callcvt (type1);
5148 ccvt2 = ix86_get_callcvt (type2);
5149 if (ccvt1 != ccvt2)
5150 return 0;
5151 if (ix86_function_regparm (type1, NULL)
5152 != ix86_function_regparm (type2, NULL))
5153 return 0;
5155 return 1;
5158 /* Return the regparm value for a function with the indicated TYPE and DECL.
5159 DECL may be NULL when calling function indirectly
5160 or considering a libcall. */
5162 static int
5163 ix86_function_regparm (const_tree type, const_tree decl)
5165 tree attr;
5166 int regparm;
5167 unsigned int ccvt;
5169 if (TARGET_64BIT)
5170 return (ix86_function_type_abi (type) == SYSV_ABI
5171 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5172 ccvt = ix86_get_callcvt (type);
5173 regparm = ix86_regparm;
5175 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5177 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5178 if (attr)
5180 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5181 return regparm;
5184 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5185 return 2;
5186 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5187 return 1;
5189 /* Use register calling convention for local functions when possible. */
5190 if (decl
5191 && TREE_CODE (decl) == FUNCTION_DECL
5192 && optimize
5193 && !(profile_flag && !flag_fentry))
5195 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5196 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5197 if (i && i->local && i->can_change_signature)
5199 int local_regparm, globals = 0, regno;
5201 /* Make sure no regparm register is taken by a
5202 fixed register variable. */
5203 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5204 if (fixed_regs[local_regparm])
5205 break;
5207 /* We don't want to use regparm(3) for nested functions as
5208 these use a static chain pointer in the third argument. */
5209 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5210 local_regparm = 2;
5212 /* In 32-bit mode save a register for the split stack. */
5213 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5214 local_regparm = 2;
5216 /* Each fixed register usage increases register pressure,
5217 so fewer registers should be used for argument passing.
5218 This functionality can be overridden by an explicit
5219 regparm value. */
5220 for (regno = AX_REG; regno <= DI_REG; regno++)
5221 if (fixed_regs[regno])
5222 globals++;
5224 local_regparm
5225 = globals < local_regparm ? local_regparm - globals : 0;
5227 if (local_regparm > regparm)
5228 regparm = local_regparm;
5232 return regparm;
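/* Illustrative example (not part of the original source): for

       void __attribute__ ((regparm (3))) h (int a, int b, int c);

   the routine above returns 3, so the three arguments travel in %eax, %edx
   and %ecx; a fastcall type yields 2 and a thiscall type yields 1.  */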
5235 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5236 DFmode (2) arguments in SSE registers for a function with the
5237 indicated TYPE and DECL. DECL may be NULL when calling function
5238 indirectly or considering a libcall. Otherwise return 0. */
5240 static int
5241 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5243 gcc_assert (!TARGET_64BIT);
5245 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5246 by the sseregparm attribute. */
5247 if (TARGET_SSEREGPARM
5248 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5250 if (!TARGET_SSE)
5252 if (warn)
5254 if (decl)
5255 error ("calling %qD with attribute sseregparm without "
5256 "SSE/SSE2 enabled", decl);
5257 else
5258 error ("calling %qT with attribute sseregparm without "
5259 "SSE/SSE2 enabled", type);
5261 return 0;
5264 return 2;
5267 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5268 (and DFmode for SSE2) arguments in SSE registers. */
5269 if (decl && TARGET_SSE_MATH && optimize
5270 && !(profile_flag && !flag_fentry))
5272 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5273 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5274 if (i && i->local && i->can_change_signature)
5275 return TARGET_SSE2 ? 2 : 1;
5278 return 0;
5281 /* Return true if EAX is live at the start of the function. Used by
5282 ix86_expand_prologue to determine if we need special help before
5283 calling allocate_stack_worker. */
5285 static bool
5286 ix86_eax_live_at_start_p (void)
5288 /* Cheat. Don't bother working forward from ix86_function_regparm
5289 to the function type to whether an actual argument is located in
5290 eax. Instead just look at cfg info, which is still close enough
5291 to correct at this point. This gives false positives for broken
5292 functions that might use uninitialized data that happens to be
5293 allocated in eax, but who cares? */
5294 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5297 static bool
5298 ix86_keep_aggregate_return_pointer (tree fntype)
5300 tree attr;
5302 if (!TARGET_64BIT)
5304 attr = lookup_attribute ("callee_pop_aggregate_return",
5305 TYPE_ATTRIBUTES (fntype));
5306 if (attr)
5307 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5309 /* For 32-bit MS-ABI the default is to keep aggregate
5310 return pointer. */
5311 if (ix86_function_type_abi (fntype) == MS_ABI)
5312 return true;
5314 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5317 /* Value is the number of bytes of arguments automatically
5318 popped when returning from a subroutine call.
5319 FUNDECL is the declaration node of the function (as a tree),
5320 FUNTYPE is the data type of the function (as a tree),
5321 or for a library call it is an identifier node for the subroutine name.
5322 SIZE is the number of bytes of arguments passed on the stack.
5324 On the 80386, the RTD insn may be used to pop them if the number
5325 of args is fixed, but if the number is variable then the caller
5326 must pop them all. RTD can't be used for library calls now
5327 because the library is compiled with the Unix compiler.
5328 Use of RTD is a selectable option, since it is incompatible with
5329 standard Unix calling sequences. If the option is not selected,
5330 the caller must always pop the args.
5332 The attribute stdcall is equivalent to RTD on a per module basis. */
5334 static int
5335 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5337 unsigned int ccvt;
5339 /* None of the 64-bit ABIs pop arguments. */
5340 if (TARGET_64BIT)
5341 return 0;
5343 ccvt = ix86_get_callcvt (funtype);
5345 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5346 | IX86_CALLCVT_THISCALL)) != 0
5347 && ! stdarg_p (funtype))
5348 return size;
5350 /* Lose any fake structure return argument if it is passed on the stack. */
5351 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5352 && !ix86_keep_aggregate_return_pointer (funtype))
5354 int nregs = ix86_function_regparm (funtype, fundecl);
5355 if (nregs == 0)
5356 return GET_MODE_SIZE (Pmode);
5359 return 0;
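/* Illustrative example (not part of the original source): for

       int __attribute__ ((stdcall)) cb (int a, int b);

   FUNTYPE is stdcall and not stdarg, so the routine above returns SIZE and
   the callee pops its 8 bytes of stack arguments with "ret $8"; a cdecl
   function returns 0 and leaves the popping to the caller.  */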
5362 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5364 static bool
5365 ix86_legitimate_combined_insn (rtx insn)
5367 /* Check operand constraints in case hard registers were propagated
5368 into insn pattern. This check prevents combine pass from
5369 generating insn patterns with invalid hard register operands.
5370 These invalid insns can eventually confuse reload to error out
5371 with a spill failure. See also PRs 46829 and 46843. */
5372 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5374 int i;
5376 extract_insn (insn);
5377 preprocess_constraints ();
5379 for (i = 0; i < recog_data.n_operands; i++)
5381 rtx op = recog_data.operand[i];
5382 enum machine_mode mode = GET_MODE (op);
5383 struct operand_alternative *op_alt;
5384 int offset = 0;
5385 bool win;
5386 int j;
5388 /* A unary operator may be accepted by the predicate, but it
5389 is irrelevant for matching constraints. */
5390 if (UNARY_P (op))
5391 op = XEXP (op, 0);
5393 if (GET_CODE (op) == SUBREG)
5395 if (REG_P (SUBREG_REG (op))
5396 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5397 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5398 GET_MODE (SUBREG_REG (op)),
5399 SUBREG_BYTE (op),
5400 GET_MODE (op));
5401 op = SUBREG_REG (op);
5404 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5405 continue;
5407 op_alt = recog_op_alt[i];
5409 /* Operand has no constraints, anything is OK. */
5410 win = !recog_data.n_alternatives;
5412 for (j = 0; j < recog_data.n_alternatives; j++)
5414 if (op_alt[j].anything_ok
5415 || (op_alt[j].matches != -1
5416 && operands_match_p
5417 (recog_data.operand[i],
5418 recog_data.operand[op_alt[j].matches]))
5419 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5421 win = true;
5422 break;
5426 if (!win)
5427 return false;
5431 return true;
5434 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5436 static unsigned HOST_WIDE_INT
5437 ix86_asan_shadow_offset (void)
5439 return (unsigned HOST_WIDE_INT) 1 << (TARGET_LP64 ? 44 : 29);
5442 /* Argument support functions. */
5444 /* Return true when register may be used to pass function parameters. */
5445 bool
5446 ix86_function_arg_regno_p (int regno)
5448 int i;
5449 const int *parm_regs;
5451 if (!TARGET_64BIT)
5453 if (TARGET_MACHO)
5454 return (regno < REGPARM_MAX
5455 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5456 else
5457 return (regno < REGPARM_MAX
5458 || (TARGET_MMX && MMX_REGNO_P (regno)
5459 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5460 || (TARGET_SSE && SSE_REGNO_P (regno)
5461 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5464 if (TARGET_MACHO)
5466 if (SSE_REGNO_P (regno) && TARGET_SSE)
5467 return true;
5469 else
5471 if (TARGET_SSE && SSE_REGNO_P (regno)
5472 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5473 return true;
5476 /* TODO: The function should depend on current function ABI but
5477 builtins.c would need updating then. Therefore we use the
5478 default ABI. */
5480 /* RAX is used as hidden argument to va_arg functions. */
5481 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5482 return true;
5484 if (ix86_abi == MS_ABI)
5485 parm_regs = x86_64_ms_abi_int_parameter_registers;
5486 else
5487 parm_regs = x86_64_int_parameter_registers;
5488 for (i = 0; i < (ix86_abi == MS_ABI
5489 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5490 if (regno == parm_regs[i])
5491 return true;
5492 return false;
5495 /* Return true if we do not know how to pass TYPE solely in registers. */
5497 static bool
5498 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5500 if (must_pass_in_stack_var_size_or_pad (mode, type))
5501 return true;
5503 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5504 The layout_type routine is crafty and tries to trick us into passing
5505 currently unsupported vector types on the stack by using TImode. */
5506 return (!TARGET_64BIT && mode == TImode
5507 && type && TREE_CODE (type) != VECTOR_TYPE);
5510 /* Return the size, in bytes, of the area reserved for arguments passed
5511 in registers for the function represented by FNDECL, depending on the
5512 ABI in use. */
5513 int
5514 ix86_reg_parm_stack_space (const_tree fndecl)
5516 enum calling_abi call_abi = SYSV_ABI;
5517 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5518 call_abi = ix86_function_abi (fndecl);
5519 else
5520 call_abi = ix86_function_type_abi (fndecl);
5521 if (TARGET_64BIT && call_abi == MS_ABI)
5522 return 32;
5523 return 0;
5526 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5527 call ABI used. */
5528 enum calling_abi
5529 ix86_function_type_abi (const_tree fntype)
5531 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5533 enum calling_abi abi = ix86_abi;
5534 if (abi == SYSV_ABI)
5536 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5537 abi = MS_ABI;
5539 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5540 abi = SYSV_ABI;
5541 return abi;
5543 return ix86_abi;
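/* Illustrative example (not part of the original source): on a target whose
   default ABI is SYSV_ABI, a declaration such as

       void __attribute__ ((ms_abi)) win_callback (void *);

   makes the lookup above return MS_ABI for its function type, while the
   sysv_abi attribute performs the opposite switch when the default is
   MS_ABI.  */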
5546 static bool
5547 ix86_function_ms_hook_prologue (const_tree fn)
5549 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5551 if (decl_function_context (fn) != NULL_TREE)
5552 error_at (DECL_SOURCE_LOCATION (fn),
5553 "ms_hook_prologue is not compatible with nested function");
5554 else
5555 return true;
5557 return false;
5560 static enum calling_abi
5561 ix86_function_abi (const_tree fndecl)
5563 if (! fndecl)
5564 return ix86_abi;
5565 return ix86_function_type_abi (TREE_TYPE (fndecl));
5568 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5569 call ABI used. */
5570 enum calling_abi
5571 ix86_cfun_abi (void)
5573 if (! cfun)
5574 return ix86_abi;
5575 return cfun->machine->call_abi;
5578 /* Write the extra assembler code needed to declare a function properly. */
5580 void
5581 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5582 tree decl)
5584 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5586 if (is_ms_hook)
5588 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5589 unsigned int filler_cc = 0xcccccccc;
5591 for (i = 0; i < filler_count; i += 4)
5592 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5595 #ifdef SUBTARGET_ASM_UNWIND_INIT
5596 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5597 #endif
5599 ASM_OUTPUT_LABEL (asm_out_file, fname);
5601 /* Output magic byte marker, if hot-patch attribute is set. */
5602 if (is_ms_hook)
5604 if (TARGET_64BIT)
5606 /* leaq [%rsp + 0], %rsp */
5607 asm_fprintf (asm_out_file, ASM_BYTE
5608 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5610 else
5612 /* movl.s %edi, %edi
5613 push %ebp
5614 movl.s %esp, %ebp */
5615 asm_fprintf (asm_out_file, ASM_BYTE
5616 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5621 /* regclass.c */
5622 extern void init_regs (void);
5624 /* Implementation of the call ABI switching target hook. The call
5625 register sets specific to FNDECL are selected. See also
5626 ix86_conditional_register_usage for more details. */
5627 void
5628 ix86_call_abi_override (const_tree fndecl)
5630 if (fndecl == NULL_TREE)
5631 cfun->machine->call_abi = ix86_abi;
5632 else
5633 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5636 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5637 expensive re-initialization of init_regs each time we switch function context
5638 since this is needed only during RTL expansion. */
5639 static void
5640 ix86_maybe_switch_abi (void)
5642 if (TARGET_64BIT &&
5643 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5644 reinit_regs ();
5647 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5648 for a call to a function whose data type is FNTYPE.
5649 For a library call, FNTYPE is 0. */
5651 void
5652 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5653 tree fntype, /* tree ptr for function decl */
5654 rtx libname, /* SYMBOL_REF of library name or 0 */
5655 tree fndecl,
5656 int caller)
5658 struct cgraph_local_info *i;
5660 memset (cum, 0, sizeof (*cum));
5662 if (fndecl)
5664 i = cgraph_local_info (fndecl);
5665 cum->call_abi = ix86_function_abi (fndecl);
5667 else
5669 i = NULL;
5670 cum->call_abi = ix86_function_type_abi (fntype);
5673 cum->caller = caller;
5675 /* Set up the number of registers to use for passing arguments. */
5677 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5678 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5679 "or subtarget optimization implying it");
5680 cum->nregs = ix86_regparm;
5681 if (TARGET_64BIT)
5683 cum->nregs = (cum->call_abi == SYSV_ABI
5684 ? X86_64_REGPARM_MAX
5685 : X86_64_MS_REGPARM_MAX);
5687 if (TARGET_SSE)
5689 cum->sse_nregs = SSE_REGPARM_MAX;
5690 if (TARGET_64BIT)
5692 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5693 ? X86_64_SSE_REGPARM_MAX
5694 : X86_64_MS_SSE_REGPARM_MAX);
5697 if (TARGET_MMX)
5698 cum->mmx_nregs = MMX_REGPARM_MAX;
5699 cum->warn_avx = true;
5700 cum->warn_sse = true;
5701 cum->warn_mmx = true;
5703 /* Because the type might mismatch between caller and callee, we need to
5704 use the actual type of the function for local calls.
5705 FIXME: cgraph_analyze can be told to actually record if function uses
5706 va_start so for local functions maybe_vaarg can be made aggressive
5707 helping K&R code.
5708 FIXME: once the type system is fixed, we won't need this code anymore. */
5709 if (i && i->local && i->can_change_signature)
5710 fntype = TREE_TYPE (fndecl);
5711 cum->maybe_vaarg = (fntype
5712 ? (!prototype_p (fntype) || stdarg_p (fntype))
5713 : !libname);
5715 if (!TARGET_64BIT)
5717 /* If there are variable arguments, then we won't pass anything
5718 in registers in 32-bit mode. */
5719 if (stdarg_p (fntype))
5721 cum->nregs = 0;
5722 cum->sse_nregs = 0;
5723 cum->mmx_nregs = 0;
5724 cum->warn_avx = 0;
5725 cum->warn_sse = 0;
5726 cum->warn_mmx = 0;
5727 return;
5730 /* Use ecx and edx registers if function has fastcall attribute,
5731 else look for regparm information. */
5732 if (fntype)
5734 unsigned int ccvt = ix86_get_callcvt (fntype);
5735 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5737 cum->nregs = 1;
5738 cum->fastcall = 1; /* Same first register as in fastcall. */
5740 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5742 cum->nregs = 2;
5743 cum->fastcall = 1;
5745 else
5746 cum->nregs = ix86_function_regparm (fntype, fndecl);
5749 /* Set up the number of SSE registers used for passing SFmode
5750 and DFmode arguments. Warn for mismatching ABI. */
5751 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5755 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5756 But in the case of vector types, it is some vector mode.
5758 When we have only some of our vector isa extensions enabled, then there
5759 are some modes for which vector_mode_supported_p is false. For these
5760 modes, the generic vector support in gcc will choose some non-vector mode
5761 in order to implement the type. By computing the natural mode, we'll
5762 select the proper ABI location for the operand and not depend on whatever
5763 the middle-end decides to do with these vector types.
5765 The middle-end can't deal with vector types larger than 16 bytes. In this
5766 case, we return the original mode and warn ABI change if CUM isn't
5767 NULL. */
5769 static enum machine_mode
5770 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5772 enum machine_mode mode = TYPE_MODE (type);
5774 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5776 HOST_WIDE_INT size = int_size_in_bytes (type);
5777 if ((size == 8 || size == 16 || size == 32)
5778 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5779 && TYPE_VECTOR_SUBPARTS (type) > 1)
5781 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5783 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5784 mode = MIN_MODE_VECTOR_FLOAT;
5785 else
5786 mode = MIN_MODE_VECTOR_INT;
5788 /* Get the mode which has this inner mode and number of units. */
5789 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5790 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5791 && GET_MODE_INNER (mode) == innermode)
5793 if (size == 32 && !TARGET_AVX)
5795 static bool warnedavx;
5797 if (cum
5798 && !warnedavx
5799 && cum->warn_avx)
5801 warnedavx = true;
5802 warning (0, "AVX vector argument without AVX "
5803 "enabled changes the ABI");
5805 return TYPE_MODE (type);
5807 else if ((size == 8 || size == 16) && !TARGET_SSE)
5809 static bool warnedsse;
5811 if (cum
5812 && !warnedsse
5813 && cum->warn_sse)
5815 warnedsse = true;
5816 warning (0, "SSE vector argument without SSE "
5817 "enabled changes the ABI");
5819 return mode;
5821 else
5822 return mode;
5825 gcc_unreachable ();
5829 return mode;
5832 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5833 this may not agree with the mode that the type system has chosen for the
5834 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5835 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5837 static rtx
5838 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5839 unsigned int regno)
5841 rtx tmp;
5843 if (orig_mode != BLKmode)
5844 tmp = gen_rtx_REG (orig_mode, regno);
5845 else
5847 tmp = gen_rtx_REG (mode, regno);
5848 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5849 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5852 return tmp;
5855 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5856 of this code is to classify each 8bytes of incoming argument by the register
5857 class and assign registers accordingly. */
5859 /* Return the union class of CLASS1 and CLASS2.
5860 See the x86-64 PS ABI for details. */
5862 static enum x86_64_reg_class
5863 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5865 /* Rule #1: If both classes are equal, this is the resulting class. */
5866 if (class1 == class2)
5867 return class1;
5869 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5870 the other class. */
5871 if (class1 == X86_64_NO_CLASS)
5872 return class2;
5873 if (class2 == X86_64_NO_CLASS)
5874 return class1;
5876 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5877 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5878 return X86_64_MEMORY_CLASS;
5880 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5881 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5882 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5883 return X86_64_INTEGERSI_CLASS;
5884 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5885 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5886 return X86_64_INTEGER_CLASS;
5888 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5889 MEMORY is used. */
5890 if (class1 == X86_64_X87_CLASS
5891 || class1 == X86_64_X87UP_CLASS
5892 || class1 == X86_64_COMPLEX_X87_CLASS
5893 || class2 == X86_64_X87_CLASS
5894 || class2 == X86_64_X87UP_CLASS
5895 || class2 == X86_64_COMPLEX_X87_CLASS)
5896 return X86_64_MEMORY_CLASS;
5898 /* Rule #6: Otherwise class SSE is used. */
5899 return X86_64_SSE_CLASS;
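/* Illustrative example (not part of the original source): for

       struct s { int a; float b; };

   both fields share one eightbyte; merging X86_64_INTEGERSI_CLASS (from the
   int) with X86_64_SSE_CLASS (from the float at offset 4) resolves, by rule
   #4 above, to X86_64_INTEGER_CLASS, so the whole struct is passed in a
   single general-purpose register.  */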
5902 /* Classify the argument of type TYPE and mode MODE.
5903 CLASSES will be filled by the register class used to pass each word
5904 of the operand. The number of words is returned. In case the parameter
5905 should be passed in memory, 0 is returned. As a special case for zero
5906 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5908 BIT_OFFSET is used internally for handling records and specifies the
5909 offset in bits modulo 256 to avoid overflow cases.
5911 See the x86-64 PS ABI for details.
5914 static int
5915 classify_argument (enum machine_mode mode, const_tree type,
5916 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5918 HOST_WIDE_INT bytes =
5919 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5920 int words
5921 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5923 /* Variable sized entities are always passed/returned in memory. */
5924 if (bytes < 0)
5925 return 0;
5927 if (mode != VOIDmode
5928 && targetm.calls.must_pass_in_stack (mode, type))
5929 return 0;
5931 /* Special case check for pointer to shared, on 64-bit target. */
5932 if (TARGET_64BIT && mode == TImode
5933 && type && TREE_CODE (type) == POINTER_TYPE
5934 && upc_shared_type_p (TREE_TYPE (type)))
5936 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5937 return 2;
5940 if (type && AGGREGATE_TYPE_P (type))
5942 int i;
5943 tree field;
5944 enum x86_64_reg_class subclasses[MAX_CLASSES];
5946 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5947 if (bytes > 32)
5948 return 0;
5950 for (i = 0; i < words; i++)
5951 classes[i] = X86_64_NO_CLASS;
5953 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5954 signal the memory class, so handle it as a special case. */
5955 if (!words)
5957 classes[0] = X86_64_NO_CLASS;
5958 return 1;
5961 /* Classify each field of record and merge classes. */
5962 switch (TREE_CODE (type))
5964 case RECORD_TYPE:
5965 /* And now merge the fields of structure. */
5966 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5968 if (TREE_CODE (field) == FIELD_DECL)
5970 int num;
5972 if (TREE_TYPE (field) == error_mark_node)
5973 continue;
5975 /* Bitfields are always classified as integer. Handle them
5976 early, since later code would consider them to be
5977 misaligned integers. */
5978 if (DECL_BIT_FIELD (field))
5980 for (i = (int_bit_position (field)
5981 + (bit_offset % 64)) / 8 / 8;
5982 i < ((int_bit_position (field) + (bit_offset % 64))
5983 + tree_low_cst (DECL_SIZE (field), 0)
5984 + 63) / 8 / 8; i++)
5985 classes[i] =
5986 merge_classes (X86_64_INTEGER_CLASS,
5987 classes[i]);
5989 else
5991 int pos;
5993 type = TREE_TYPE (field);
5995 /* Flexible array member is ignored. */
5996 if (TYPE_MODE (type) == BLKmode
5997 && TREE_CODE (type) == ARRAY_TYPE
5998 && TYPE_SIZE (type) == NULL_TREE
5999 && TYPE_DOMAIN (type) != NULL_TREE
6000 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6001 == NULL_TREE))
6003 static bool warned;
6005 if (!warned && warn_psabi)
6007 warned = true;
6008 inform (input_location,
6009 "the ABI of passing struct with"
6010 " a flexible array member has"
6011 " changed in GCC 4.4");
6013 continue;
6015 num = classify_argument (TYPE_MODE (type), type,
6016 subclasses,
6017 (int_bit_position (field)
6018 + bit_offset) % 256);
6019 if (!num)
6020 return 0;
6021 pos = (int_bit_position (field)
6022 + (bit_offset % 64)) / 8 / 8;
6023 for (i = 0; i < num && (i + pos) < words; i++)
6024 classes[i + pos] =
6025 merge_classes (subclasses[i], classes[i + pos]);
6029 break;
6031 case ARRAY_TYPE:
6032 /* Arrays are handled as small records. */
6034 int num;
6035 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6036 TREE_TYPE (type), subclasses, bit_offset);
6037 if (!num)
6038 return 0;
6040 /* The partial classes are now full classes. */
6041 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6042 subclasses[0] = X86_64_SSE_CLASS;
6043 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6044 && !((bit_offset % 64) == 0 && bytes == 4))
6045 subclasses[0] = X86_64_INTEGER_CLASS;
6047 for (i = 0; i < words; i++)
6048 classes[i] = subclasses[i % num];
6050 break;
6052 case UNION_TYPE:
6053 case QUAL_UNION_TYPE:
6054 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6056 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6058 if (TREE_CODE (field) == FIELD_DECL)
6060 int num;
6062 if (TREE_TYPE (field) == error_mark_node)
6063 continue;
6065 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6066 TREE_TYPE (field), subclasses,
6067 bit_offset);
6068 if (!num)
6069 return 0;
6070 for (i = 0; i < num; i++)
6071 classes[i] = merge_classes (subclasses[i], classes[i]);
6074 break;
6076 default:
6077 gcc_unreachable ();
6080 if (words > 2)
6082 /* When size > 16 bytes, if the first eightbyte isn't
6083 X86_64_SSE_CLASS or any of the others isn't
6084 X86_64_SSEUP_CLASS, everything should be passed in
6085 memory. */
6086 if (classes[0] != X86_64_SSE_CLASS)
6087 return 0;
6089 for (i = 1; i < words; i++)
6090 if (classes[i] != X86_64_SSEUP_CLASS)
6091 return 0;
6094 /* Final merger cleanup. */
6095 for (i = 0; i < words; i++)
6097 /* If one class is MEMORY, everything should be passed in
6098 memory. */
6099 if (classes[i] == X86_64_MEMORY_CLASS)
6100 return 0;
6102 /* The X86_64_SSEUP_CLASS should be always preceded by
6103 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6104 if (classes[i] == X86_64_SSEUP_CLASS
6105 && classes[i - 1] != X86_64_SSE_CLASS
6106 && classes[i - 1] != X86_64_SSEUP_CLASS)
6108 /* The first one should never be X86_64_SSEUP_CLASS. */
6109 gcc_assert (i != 0);
6110 classes[i] = X86_64_SSE_CLASS;
6113 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6114 everything should be passed in memory. */
6115 if (classes[i] == X86_64_X87UP_CLASS
6116 && (classes[i - 1] != X86_64_X87_CLASS))
6118 static bool warned;
6120 /* The first one should never be X86_64_X87UP_CLASS. */
6121 gcc_assert (i != 0);
6122 if (!warned && warn_psabi)
6124 warned = true;
6125 inform (input_location,
6126 "the ABI of passing union with long double"
6127 " has changed in GCC 4.4");
6129 return 0;
6132 return words;
6135 /* Compute alignment needed. We align all types to natural boundaries with
6136 exception of XFmode that is aligned to 64bits. */
6137 if (mode != VOIDmode && mode != BLKmode)
6139 int mode_alignment = GET_MODE_BITSIZE (mode);
6141 if (mode == XFmode)
6142 mode_alignment = 128;
6143 else if (mode == XCmode)
6144 mode_alignment = 256;
6145 if (COMPLEX_MODE_P (mode))
6146 mode_alignment /= 2;
6147 /* Misaligned fields are always returned in memory. */
6148 if (bit_offset % mode_alignment)
6149 return 0;
6152 /* for V1xx modes, just use the base mode */
6153 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6154 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6155 mode = GET_MODE_INNER (mode);
6157 /* Classification of atomic types. */
6158 switch (mode)
6160 case SDmode:
6161 case DDmode:
6162 classes[0] = X86_64_SSE_CLASS;
6163 return 1;
6164 case TDmode:
6165 classes[0] = X86_64_SSE_CLASS;
6166 classes[1] = X86_64_SSEUP_CLASS;
6167 return 2;
6168 case DImode:
6169 case SImode:
6170 case HImode:
6171 case QImode:
6172 case CSImode:
6173 case CHImode:
6174 case CQImode:
6176 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6178 if (size <= 32)
6180 classes[0] = X86_64_INTEGERSI_CLASS;
6181 return 1;
6183 else if (size <= 64)
6185 classes[0] = X86_64_INTEGER_CLASS;
6186 return 1;
6188 else if (size <= 64+32)
6190 classes[0] = X86_64_INTEGER_CLASS;
6191 classes[1] = X86_64_INTEGERSI_CLASS;
6192 return 2;
6194 else if (size <= 64+64)
6196 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6197 return 2;
6199 else
6200 gcc_unreachable ();
6202 case CDImode:
6203 case TImode:
6204 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6205 return 2;
6206 case COImode:
6207 case OImode:
6208 /* OImode shouldn't be used directly. */
6209 gcc_unreachable ();
6210 case CTImode:
6211 return 0;
6212 case SFmode:
6213 if (!(bit_offset % 64))
6214 classes[0] = X86_64_SSESF_CLASS;
6215 else
6216 classes[0] = X86_64_SSE_CLASS;
6217 return 1;
6218 case DFmode:
6219 classes[0] = X86_64_SSEDF_CLASS;
6220 return 1;
6221 case XFmode:
6222 classes[0] = X86_64_X87_CLASS;
6223 classes[1] = X86_64_X87UP_CLASS;
6224 return 2;
6225 case TFmode:
6226 classes[0] = X86_64_SSE_CLASS;
6227 classes[1] = X86_64_SSEUP_CLASS;
6228 return 2;
6229 case SCmode:
6230 classes[0] = X86_64_SSE_CLASS;
6231 if (!(bit_offset % 64))
6232 return 1;
6233 else
6235 static bool warned;
6237 if (!warned && warn_psabi)
6239 warned = true;
6240 inform (input_location,
6241 "the ABI of passing structure with complex float"
6242 " member has changed in GCC 4.4");
6244 classes[1] = X86_64_SSESF_CLASS;
6245 return 2;
6247 case DCmode:
6248 classes[0] = X86_64_SSEDF_CLASS;
6249 classes[1] = X86_64_SSEDF_CLASS;
6250 return 2;
6251 case XCmode:
6252 classes[0] = X86_64_COMPLEX_X87_CLASS;
6253 return 1;
6254 case TCmode:
6255 /* This mode is larger than 16 bytes. */
6256 return 0;
6257 case V8SFmode:
6258 case V8SImode:
6259 case V32QImode:
6260 case V16HImode:
6261 case V4DFmode:
6262 case V4DImode:
6263 classes[0] = X86_64_SSE_CLASS;
6264 classes[1] = X86_64_SSEUP_CLASS;
6265 classes[2] = X86_64_SSEUP_CLASS;
6266 classes[3] = X86_64_SSEUP_CLASS;
6267 return 4;
6268 case V4SFmode:
6269 case V4SImode:
6270 case V16QImode:
6271 case V8HImode:
6272 case V2DFmode:
6273 case V2DImode:
6274 classes[0] = X86_64_SSE_CLASS;
6275 classes[1] = X86_64_SSEUP_CLASS;
6276 return 2;
6277 case V1TImode:
6278 case V1DImode:
6279 case V2SFmode:
6280 case V2SImode:
6281 case V4HImode:
6282 case V8QImode:
6283 classes[0] = X86_64_SSE_CLASS;
6284 return 1;
6285 case BLKmode:
6286 case VOIDmode:
6287 return 0;
6288 default:
6289 gcc_assert (VECTOR_MODE_P (mode));
6291 if (bytes > 16)
6292 return 0;
6294 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6296 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6297 classes[0] = X86_64_INTEGERSI_CLASS;
6298 else
6299 classes[0] = X86_64_INTEGER_CLASS;
6300 classes[1] = X86_64_INTEGER_CLASS;
6301 return 1 + (bytes > 8);
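/* Illustrative example (not part of the original source): for

       struct pt { long tag; double value; };

   the record spans two eightbytes; the first is classified
   X86_64_INTEGER_CLASS and the second X86_64_SSEDF_CLASS, so
   classify_argument returns 2 and the struct travels in one general-purpose
   register and one SSE register.  */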
6305 /* Examine the argument and set the number of registers required in each
6306 class. Return 0 iff the parameter should be passed in memory. */
6307 static int
6308 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6309 int *int_nregs, int *sse_nregs)
6311 enum x86_64_reg_class regclass[MAX_CLASSES];
6312 int n = classify_argument (mode, type, regclass, 0);
6314 *int_nregs = 0;
6315 *sse_nregs = 0;
6316 if (!n)
6317 return 0;
6318 for (n--; n >= 0; n--)
6319 switch (regclass[n])
6321 case X86_64_INTEGER_CLASS:
6322 case X86_64_INTEGERSI_CLASS:
6323 (*int_nregs)++;
6324 break;
6325 case X86_64_SSE_CLASS:
6326 case X86_64_SSESF_CLASS:
6327 case X86_64_SSEDF_CLASS:
6328 (*sse_nregs)++;
6329 break;
6330 case X86_64_NO_CLASS:
6331 case X86_64_SSEUP_CLASS:
6332 break;
6333 case X86_64_X87_CLASS:
6334 case X86_64_X87UP_CLASS:
6335 if (!in_return)
6336 return 0;
6337 break;
6338 case X86_64_COMPLEX_X87_CLASS:
6339 return in_return ? 2 : 0;
6340 case X86_64_MEMORY_CLASS:
6341 gcc_unreachable ();
6343 return 1;
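/* Illustrative example (not part of the original source): a complex long
   double (XCmode) is classified as X86_64_COMPLEX_X87_CLASS, so the routine
   above returns 0 when the value is an argument (it must be passed in
   memory) but succeeds for IN_RETURN, since the value is returned on the
   x87 register stack.  */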
6346 /* Construct container for the argument used by GCC interface. See
6347 FUNCTION_ARG for the detailed description. */
6349 static rtx
6350 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6351 const_tree type, int in_return, int nintregs, int nsseregs,
6352 const int *intreg, int sse_regno)
6354 /* The following variables hold the static issued_error state. */
6355 static bool issued_sse_arg_error;
6356 static bool issued_sse_ret_error;
6357 static bool issued_x87_ret_error;
6359 enum machine_mode tmpmode;
6360 int bytes =
6361 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6362 enum x86_64_reg_class regclass[MAX_CLASSES];
6363 int n;
6364 int i;
6365 int nexps = 0;
6366 int needed_sseregs, needed_intregs;
6367 rtx exp[MAX_CLASSES];
6368 rtx ret;
6370 n = classify_argument (mode, type, regclass, 0);
6371 if (!n)
6372 return NULL;
6373 if (!examine_argument (mode, type, in_return, &needed_intregs,
6374 &needed_sseregs))
6375 return NULL;
6376 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6377 return NULL;
6379 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6380 some less clueful developer tries to use floating-point anyway. */
6381 if (needed_sseregs && !TARGET_SSE)
6383 if (in_return)
6385 if (!issued_sse_ret_error)
6387 error ("SSE register return with SSE disabled");
6388 issued_sse_ret_error = true;
6391 else if (!issued_sse_arg_error)
6393 error ("SSE register argument with SSE disabled");
6394 issued_sse_arg_error = true;
6396 return NULL;
6399 /* Likewise, error if the ABI requires us to return values in the
6400 x87 registers and the user specified -mno-80387. */
6401 if (!TARGET_80387 && in_return)
6402 for (i = 0; i < n; i++)
6403 if (regclass[i] == X86_64_X87_CLASS
6404 || regclass[i] == X86_64_X87UP_CLASS
6405 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6407 if (!issued_x87_ret_error)
6409 error ("x87 register return with x87 disabled");
6410 issued_x87_ret_error = true;
6412 return NULL;
6415 /* First construct simple cases. Avoid SCmode, since we want to use
6416 single register to pass this type. */
6417 if (n == 1 && mode != SCmode)
6418 switch (regclass[0])
6420 case X86_64_INTEGER_CLASS:
6421 case X86_64_INTEGERSI_CLASS:
6422 return gen_rtx_REG (mode, intreg[0]);
6423 case X86_64_SSE_CLASS:
6424 case X86_64_SSESF_CLASS:
6425 case X86_64_SSEDF_CLASS:
6426 if (mode != BLKmode)
6427 return gen_reg_or_parallel (mode, orig_mode,
6428 SSE_REGNO (sse_regno));
6429 break;
6430 case X86_64_X87_CLASS:
6431 case X86_64_COMPLEX_X87_CLASS:
6432 return gen_rtx_REG (mode, FIRST_STACK_REG);
6433 case X86_64_NO_CLASS:
6434 /* Zero sized array, struct or class. */
6435 return NULL;
6436 default:
6437 gcc_unreachable ();
6439 if (n == 2
6440 && regclass[0] == X86_64_SSE_CLASS
6441 && regclass[1] == X86_64_SSEUP_CLASS
6442 && mode != BLKmode)
6443 return gen_reg_or_parallel (mode, orig_mode,
6444 SSE_REGNO (sse_regno));
6445 if (n == 4
6446 && regclass[0] == X86_64_SSE_CLASS
6447 && regclass[1] == X86_64_SSEUP_CLASS
6448 && regclass[2] == X86_64_SSEUP_CLASS
6449 && regclass[3] == X86_64_SSEUP_CLASS
6450 && mode != BLKmode)
6451 return gen_reg_or_parallel (mode, orig_mode,
6452 SSE_REGNO (sse_regno));
6453 if (n == 2
6454 && regclass[0] == X86_64_X87_CLASS
6455 && regclass[1] == X86_64_X87UP_CLASS)
6456 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6458 if (n == 2
6459 && regclass[0] == X86_64_INTEGER_CLASS
6460 && regclass[1] == X86_64_INTEGER_CLASS
6461 && (mode == CDImode || mode == TImode || mode == TFmode)
6462 && intreg[0] + 1 == intreg[1])
6463 return gen_rtx_REG (mode, intreg[0]);
6465 /* Otherwise figure out the entries of the PARALLEL. */
6466 for (i = 0; i < n; i++)
6468 int pos;
6470 switch (regclass[i])
6472 case X86_64_NO_CLASS:
6473 break;
6474 case X86_64_INTEGER_CLASS:
6475 case X86_64_INTEGERSI_CLASS:
6476 /* Merge TImodes on aligned occasions here too. */
6477 if (i * 8 + 8 > bytes)
6478 tmpmode
6479 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6480 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6481 tmpmode = SImode;
6482 else
6483 tmpmode = DImode;
6484 /* We've requested 24 bytes for which
6485 we don't have a mode. Use DImode. */
6486 if (tmpmode == BLKmode)
6487 tmpmode = DImode;
6488 exp [nexps++]
6489 = gen_rtx_EXPR_LIST (VOIDmode,
6490 gen_rtx_REG (tmpmode, *intreg),
6491 GEN_INT (i*8));
6492 intreg++;
6493 break;
6494 case X86_64_SSESF_CLASS:
6495 exp [nexps++]
6496 = gen_rtx_EXPR_LIST (VOIDmode,
6497 gen_rtx_REG (SFmode,
6498 SSE_REGNO (sse_regno)),
6499 GEN_INT (i*8));
6500 sse_regno++;
6501 break;
6502 case X86_64_SSEDF_CLASS:
6503 exp [nexps++]
6504 = gen_rtx_EXPR_LIST (VOIDmode,
6505 gen_rtx_REG (DFmode,
6506 SSE_REGNO (sse_regno)),
6507 GEN_INT (i*8));
6508 sse_regno++;
6509 break;
6510 case X86_64_SSE_CLASS:
6511 pos = i;
6512 switch (n)
6514 case 1:
6515 tmpmode = DImode;
6516 break;
6517 case 2:
6518 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6520 tmpmode = TImode;
6521 i++;
6523 else
6524 tmpmode = DImode;
6525 break;
6526 case 4:
6527 gcc_assert (i == 0
6528 && regclass[1] == X86_64_SSEUP_CLASS
6529 && regclass[2] == X86_64_SSEUP_CLASS
6530 && regclass[3] == X86_64_SSEUP_CLASS);
6531 tmpmode = OImode;
6532 i += 3;
6533 break;
6534 default:
6535 gcc_unreachable ();
6537 exp [nexps++]
6538 = gen_rtx_EXPR_LIST (VOIDmode,
6539 gen_rtx_REG (tmpmode,
6540 SSE_REGNO (sse_regno)),
6541 GEN_INT (pos*8));
6542 sse_regno++;
6543 break;
6544 default:
6545 gcc_unreachable ();
6549 /* Empty aligned struct, union or class. */
6550 if (nexps == 0)
6551 return NULL;
6553 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6554 for (i = 0; i < nexps; i++)
6555 XVECEXP (ret, 0, i) = exp [i];
6556 return ret;
6559 /* Update the data in CUM to advance over an argument of mode MODE
6560 and data type TYPE. (TYPE is null for libcalls where that information
6561 may not be available.) */
6563 static void
6564 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6565 const_tree type, HOST_WIDE_INT bytes,
6566 HOST_WIDE_INT words)
6568 switch (mode)
6570 default:
6571 break;
6573 case BLKmode:
6574 if (bytes < 0)
6575 break;
6576 /* FALLTHRU */
6578 case DImode:
6579 case SImode:
6580 case HImode:
6581 case QImode:
6582 cum->words += words;
6583 cum->nregs -= words;
6584 cum->regno += words;
6586 if (cum->nregs <= 0)
6588 cum->nregs = 0;
6589 cum->regno = 0;
6591 break;
6593 case OImode:
6594 /* OImode shouldn't be used directly. */
6595 gcc_unreachable ();
6597 case DFmode:
6598 if (cum->float_in_sse < 2)
6599 break;
6600 case SFmode:
6601 if (cum->float_in_sse < 1)
6602 break;
6603 /* FALLTHRU */
6605 case V8SFmode:
6606 case V8SImode:
6607 case V32QImode:
6608 case V16HImode:
6609 case V4DFmode:
6610 case V4DImode:
6611 case TImode:
6612 case V16QImode:
6613 case V8HImode:
6614 case V4SImode:
6615 case V2DImode:
6616 case V4SFmode:
6617 case V2DFmode:
6618 if (!type || !AGGREGATE_TYPE_P (type))
6620 cum->sse_words += words;
6621 cum->sse_nregs -= 1;
6622 cum->sse_regno += 1;
6623 if (cum->sse_nregs <= 0)
6625 cum->sse_nregs = 0;
6626 cum->sse_regno = 0;
6629 break;
6631 case V8QImode:
6632 case V4HImode:
6633 case V2SImode:
6634 case V2SFmode:
6635 case V1TImode:
6636 case V1DImode:
6637 if (!type || !AGGREGATE_TYPE_P (type))
6639 cum->mmx_words += words;
6640 cum->mmx_nregs -= 1;
6641 cum->mmx_regno += 1;
6642 if (cum->mmx_nregs <= 0)
6644 cum->mmx_nregs = 0;
6645 cum->mmx_regno = 0;
6648 break;
6652 static void
6653 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6654 const_tree type, HOST_WIDE_INT words, bool named)
6656 int int_nregs, sse_nregs;
6658 /* Unnamed 256bit vector mode parameters are passed on stack. */
6659 if (!named && VALID_AVX256_REG_MODE (mode))
6660 return;
6662 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6663 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6665 cum->nregs -= int_nregs;
6666 cum->sse_nregs -= sse_nregs;
6667 cum->regno += int_nregs;
6668 cum->sse_regno += sse_nregs;
6670 else
6672 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6673 cum->words = (cum->words + align - 1) & ~(align - 1);
6674 cum->words += words;
6678 static void
6679 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6680 HOST_WIDE_INT words)
6682 /* Otherwise, this should be passed indirectly. */
6683 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6685 cum->words += words;
6686 if (cum->nregs > 0)
6688 cum->nregs -= 1;
6689 cum->regno += 1;
6693 /* Update the data in CUM to advance over an argument of mode MODE and
6694 data type TYPE. (TYPE is null for libcalls where that information
6695 may not be available.) */
6697 static void
6698 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6699 const_tree type, bool named)
6701 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6702 HOST_WIDE_INT bytes, words;
6704 if (mode == BLKmode)
6705 bytes = int_size_in_bytes (type);
6706 else
6707 bytes = GET_MODE_SIZE (mode);
6708 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6710 if (type)
6711 mode = type_natural_mode (type, NULL);
6713 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6714 function_arg_advance_ms_64 (cum, bytes, words);
6715 else if (TARGET_64BIT)
6716 function_arg_advance_64 (cum, mode, type, words, named);
6717 else
6718 function_arg_advance_32 (cum, mode, type, bytes, words);
6721 /* Define where to put the arguments to a function.
6722 Value is zero to push the argument on the stack,
6723 or a hard register in which to store the argument.
6725 MODE is the argument's machine mode.
6726 TYPE is the data type of the argument (as a tree).
6727 This is null for libcalls where that information may
6728 not be available.
6729 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6730 the preceding args and about the function being called.
6731 NAMED is nonzero if this argument is a named parameter
6732 (otherwise it is an extra parameter matching an ellipsis). */
6734 static rtx
6735 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6736 enum machine_mode orig_mode, const_tree type,
6737 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6739 static bool warnedsse, warnedmmx;
6741 /* Avoid the AL settings for the Unix64 ABI. */
6742 if (mode == VOIDmode)
6743 return constm1_rtx;
6745 switch (mode)
6747 default:
6748 break;
6750 case BLKmode:
6751 if (bytes < 0)
6752 break;
6753 /* FALLTHRU */
6754 case DImode:
6755 case SImode:
6756 case HImode:
6757 case QImode:
6758 if (words <= cum->nregs)
6760 int regno = cum->regno;
6762 /* Fastcall allocates the first two DWORD (SImode) or
6763 smaller arguments to ECX and EDX if it isn't an
6764 aggregate type. */
6765 if (cum->fastcall)
6767 if (mode == BLKmode
6768 || mode == DImode
6769 || (type && AGGREGATE_TYPE_P (type)))
6770 break;
6772 /* ECX not EAX is the first allocated register. */
6773 if (regno == AX_REG)
6774 regno = CX_REG;
6776 return gen_rtx_REG (mode, regno);
6778 break;
6780 case DFmode:
6781 if (cum->float_in_sse < 2)
6782 break;
6783 case SFmode:
6784 if (cum->float_in_sse < 1)
6785 break;
6786 /* FALLTHRU */
6787 case TImode:
6788 /* In 32bit, we pass TImode in xmm registers. */
6789 case V16QImode:
6790 case V8HImode:
6791 case V4SImode:
6792 case V2DImode:
6793 case V4SFmode:
6794 case V2DFmode:
6795 if (!type || !AGGREGATE_TYPE_P (type))
6797 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6799 warnedsse = true;
6800 warning (0, "SSE vector argument without SSE enabled "
6801 "changes the ABI");
6803 if (cum->sse_nregs)
6804 return gen_reg_or_parallel (mode, orig_mode,
6805 cum->sse_regno + FIRST_SSE_REG);
6807 break;
6809 case OImode:
6810 /* OImode shouldn't be used directly. */
6811 gcc_unreachable ();
6813 case V8SFmode:
6814 case V8SImode:
6815 case V32QImode:
6816 case V16HImode:
6817 case V4DFmode:
6818 case V4DImode:
6819 if (!type || !AGGREGATE_TYPE_P (type))
6821 if (cum->sse_nregs)
6822 return gen_reg_or_parallel (mode, orig_mode,
6823 cum->sse_regno + FIRST_SSE_REG);
6825 break;
6827 case V8QImode:
6828 case V4HImode:
6829 case V2SImode:
6830 case V2SFmode:
6831 case V1TImode:
6832 case V1DImode:
6833 if (!type || !AGGREGATE_TYPE_P (type))
6835 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6837 warnedmmx = true;
6838 warning (0, "MMX vector argument without MMX enabled "
6839 "changes the ABI");
6841 if (cum->mmx_nregs)
6842 return gen_reg_or_parallel (mode, orig_mode,
6843 cum->mmx_regno + FIRST_MMX_REG);
6845 break;
6848 return NULL_RTX;
6851 static rtx
6852 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6853 enum machine_mode orig_mode, const_tree type, bool named)
6855 /* Handle a hidden AL argument containing number of registers
6856 for varargs x86-64 functions. */
6857 if (mode == VOIDmode)
6858 return GEN_INT (cum->maybe_vaarg
6859 ? (cum->sse_nregs < 0
6860 ? X86_64_SSE_REGPARM_MAX
6861 : cum->sse_regno)
6862 : -1);
6864 switch (mode)
6866 default:
6867 break;
6869 case V8SFmode:
6870 case V8SImode:
6871 case V32QImode:
6872 case V16HImode:
6873 case V4DFmode:
6874 case V4DImode:
6875 /* Unnamed 256bit vector mode parameters are passed on stack. */
6876 if (!named)
6877 return NULL;
6878 break;
6881 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6882 cum->sse_nregs,
6883 &x86_64_int_parameter_registers [cum->regno],
6884 cum->sse_regno);
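/* Illustrative sketch of the SysV x86-64 case handled above (default
   options, no preceding arguments, hypothetical prototype):

       struct p { long x, y; };
       void f (int a, double b, struct p c);

   classifies A as INTEGER (%rdi), B as SSE (%xmm0) and C as two INTEGER
   eightbytes (%rsi, %rdx) via construct_container.  For a varargs call
   the VOIDmode query above additionally materializes the hidden %al
   value holding the number of vector registers used.  */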
6887 static rtx
6888 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6889 enum machine_mode orig_mode, bool named,
6890 HOST_WIDE_INT bytes)
6892 unsigned int regno;
6894 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
6895 We use value of -2 to specify that current function call is MSABI. */
6896 if (mode == VOIDmode)
6897 return GEN_INT (-2);
6899 /* If we've run out of registers, it goes on the stack. */
6900 if (cum->nregs == 0)
6901 return NULL_RTX;
6903 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6905 /* Only floating point modes are passed in anything but integer regs. */
6906 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6908 if (named)
6909 regno = cum->regno + FIRST_SSE_REG;
6910 else
6912 rtx t1, t2;
6914 /* Unnamed floating parameters are passed in both the
6915 SSE and integer registers. */
6916 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6917 t2 = gen_rtx_REG (mode, regno);
6918 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6919 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6920 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6923 /* Handle aggregated types passed in register. */
6924 if (orig_mode == BLKmode)
6926 if (bytes > 0 && bytes <= 8)
6927 mode = (bytes > 4 ? DImode : SImode);
6928 if (mode == BLKmode)
6929 mode = DImode;
6932 return gen_reg_or_parallel (mode, orig_mode, regno);
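/* Rough example of the MS x64 convention implemented above: the first
   four argument slots map positionally to %rcx, %rdx, %r8 and %r9 (or
   %xmm0-%xmm3 for named SFmode/DFmode values when SSE is enabled), an
   unnamed float argument is described by a PARALLEL naming both the SSE
   and the integer register, and a BLKmode aggregate of 1-8 bytes is
   squeezed into SImode or DImode so it travels in a single GPR.  */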
6935 /* Return where to put the arguments to a function.
6936 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6938 MODE is the argument's machine mode. TYPE is the data type of the
6939 argument. It is null for libcalls where that information may not be
6940 available. CUM gives information about the preceding args and about
6941 the function being called. NAMED is nonzero if this argument is a
6942 named parameter (otherwise it is an extra parameter matching an
6943 ellipsis). */
6945 static rtx
6946 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6947 const_tree type, bool named)
6949 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6950 enum machine_mode mode = omode;
6951 HOST_WIDE_INT bytes, words;
6952 rtx arg;
6954 if (mode == BLKmode)
6955 bytes = int_size_in_bytes (type);
6956 else
6957 bytes = GET_MODE_SIZE (mode);
6958 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6960 /* To simplify the code below, represent vector types with a vector mode
6961 even if MMX/SSE are not active. */
6962 if (type && TREE_CODE (type) == VECTOR_TYPE)
6963 mode = type_natural_mode (type, cum);
6965 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6966 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6967 else if (TARGET_64BIT)
6968 arg = function_arg_64 (cum, mode, omode, type, named);
6969 else
6970 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6972 return arg;
6975 /* A C expression that indicates when an argument must be passed by
6976 reference. If nonzero for an argument, a copy of that argument is
6977 made in memory and a pointer to the argument is passed instead of
6978 the argument itself. The pointer is passed in whatever way is
6979 appropriate for passing a pointer to that type. */
6981 static bool
6982 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6983 enum machine_mode mode ATTRIBUTE_UNUSED,
6984 const_tree type, bool named ATTRIBUTE_UNUSED)
6986 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6988 /* See Windows x64 Software Convention. */
6989 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6991 int msize = (int) GET_MODE_SIZE (mode);
6992 if (type)
6994 /* Arrays are passed by reference. */
6995 if (TREE_CODE (type) == ARRAY_TYPE)
6996 return true;
6998 if (AGGREGATE_TYPE_P (type))
7000 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7001 are passed by reference. */
7002 msize = int_size_in_bytes (type);
7006 /* __m128 is passed by reference. */
7007 switch (msize) {
7008 case 1: case 2: case 4: case 8:
7009 break;
7010 default:
7011 return true;
7014 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7015 return 1;
7017 return 0;
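/* A few concrete cases of the by-reference rule above (hypothetical
   types, default options):

       struct s3 { char c[3]; }     MS x64: size 3  -> by reference
       struct s8 { long l; }        MS x64: size 8  -> by value (GPR)
       __m128                       MS x64: size 16 -> by reference
       any array type               MS x64: always by reference
       variably sized types         SysV x86-64: by reference  */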
7020 /* Return true when TYPE should be 128bit aligned for 32bit argument
7021 passing ABI. XXX: This function is obsolete and is only used for
7022 checking psABI compatibility with previous versions of GCC. */
7024 static bool
7025 ix86_compat_aligned_value_p (const_tree type)
7027 enum machine_mode mode = TYPE_MODE (type);
7028 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7029 || mode == TDmode
7030 || mode == TFmode
7031 || mode == TCmode)
7032 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7033 return true;
7034 if (TYPE_ALIGN (type) < 128)
7035 return false;
7037 if (AGGREGATE_TYPE_P (type))
7039 /* Walk the aggregates recursively. */
7040 switch (TREE_CODE (type))
7042 case RECORD_TYPE:
7043 case UNION_TYPE:
7044 case QUAL_UNION_TYPE:
7046 tree field;
7048 /* Walk all the structure fields. */
7049 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7051 if (TREE_CODE (field) == FIELD_DECL
7052 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7053 return true;
7055 break;
7058 case ARRAY_TYPE:
7059 /* Just for use if some languages pass arrays by value.  */
7060 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7061 return true;
7062 break;
7064 default:
7065 gcc_unreachable ();
7068 return false;
7071 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7072 XXX: This function is obsolete and is only used for checking psABI
7073 compatibility with previous versions of GCC. */
7075 static unsigned int
7076 ix86_compat_function_arg_boundary (enum machine_mode mode,
7077 const_tree type, unsigned int align)
7079 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7080 natural boundaries. */
7081 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7083 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7084 make an exception for SSE modes since these require 128bit
7085 alignment.
7087 The handling here differs from field_alignment. ICC aligns MMX
7088 arguments to 4 byte boundaries, while structure fields are aligned
7089 to 8 byte boundaries. */
7090 if (!type)
7092 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7093 align = PARM_BOUNDARY;
7095 else
7097 if (!ix86_compat_aligned_value_p (type))
7098 align = PARM_BOUNDARY;
7101 if (align > BIGGEST_ALIGNMENT)
7102 align = BIGGEST_ALIGNMENT;
7103 return align;
7106 /* Return true when TYPE should be 128bit aligned for 32bit argument
7107 passing ABI. */
7109 static bool
7110 ix86_contains_aligned_value_p (const_tree type)
7112 enum machine_mode mode = TYPE_MODE (type);
7114 if (mode == XFmode || mode == XCmode)
7115 return false;
7117 if (TYPE_ALIGN (type) < 128)
7118 return false;
7120 if (AGGREGATE_TYPE_P (type))
7122 /* Walk the aggregates recursively. */
7123 switch (TREE_CODE (type))
7125 case RECORD_TYPE:
7126 case UNION_TYPE:
7127 case QUAL_UNION_TYPE:
7129 tree field;
7131 /* Walk all the structure fields. */
7132 for (field = TYPE_FIELDS (type);
7133 field;
7134 field = DECL_CHAIN (field))
7136 if (TREE_CODE (field) == FIELD_DECL
7137 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7138 return true;
7140 break;
7143 case ARRAY_TYPE:
7144 /* Just for use if some languages pass arrays by value.  */
7145 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7146 return true;
7147 break;
7149 default:
7150 gcc_unreachable ();
7153 else
7154 return TYPE_ALIGN (type) >= 128;
7156 return false;
7159 /* Gives the alignment boundary, in bits, of an argument with the
7160 specified mode and type. */
7162 static unsigned int
7163 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7165 unsigned int align;
7166 if (type)
7168 /* Since the main variant type is what is used for the call, convert
7169 the type to its main variant.  */
7170 type = TYPE_MAIN_VARIANT (type);
7171 align = TYPE_ALIGN (type);
7173 else
7174 align = GET_MODE_ALIGNMENT (mode);
7175 if (align < PARM_BOUNDARY)
7176 align = PARM_BOUNDARY;
7177 else
7179 static bool warned;
7180 unsigned int saved_align = align;
7182 if (!TARGET_64BIT)
7184 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7185 if (!type)
7187 if (mode == XFmode || mode == XCmode)
7188 align = PARM_BOUNDARY;
7190 else if (!ix86_contains_aligned_value_p (type))
7191 align = PARM_BOUNDARY;
7193 if (align < 128)
7194 align = PARM_BOUNDARY;
7197 if (warn_psabi
7198 && !warned
7199 && align != ix86_compat_function_arg_boundary (mode, type,
7200 saved_align))
7202 warned = true;
7203 inform (input_location,
7204 "The ABI for passing parameters with %d-byte"
7205 " alignment has changed in GCC 4.6",
7206 align / BITS_PER_UNIT);
7210 return align;
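/* For illustration (not exhaustive): on ia32 a double parameter stays
   at PARM_BOUNDARY (32 bits), an __m128 or other 128-bit-aligned value
   is bumped to 128 bits, and whenever that result differs from the
   pre-GCC-4.6 computation in ix86_compat_function_arg_boundary the
   psABI change note above is emitted, at most once per compilation.  */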
7213 /* Return true if N is a possible register number of function value. */
7215 static bool
7216 ix86_function_value_regno_p (const unsigned int regno)
7218 switch (regno)
7220 case AX_REG:
7221 return true;
7223 case FIRST_FLOAT_REG:
7224 /* TODO: The function should depend on current function ABI but
7225 builtins.c would need updating then. Therefore we use the
7226 default ABI. */
7227 if (TARGET_64BIT && ix86_abi == MS_ABI)
7228 return false;
7229 return TARGET_FLOAT_RETURNS_IN_80387;
7231 case FIRST_SSE_REG:
7232 return TARGET_SSE;
7234 case FIRST_MMX_REG:
7235 if (TARGET_MACHO || TARGET_64BIT)
7236 return false;
7237 return TARGET_MMX;
7240 return false;
7243 /* Define how to find the value returned by a function.
7244 VALTYPE is the data type of the value (as a tree).
7245 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7246 otherwise, FUNC is 0. */
7248 static rtx
7249 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7250 const_tree fntype, const_tree fn)
7252 unsigned int regno;
7254 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7255 we normally prevent this case when mmx is not available. However
7256 some ABIs may require the result to be returned like DImode. */
7257 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7258 regno = FIRST_MMX_REG;
7260 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7261 we prevent this case when sse is not available. However some ABIs
7262 may require the result to be returned like integer TImode. */
7263 else if (mode == TImode
7264 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7265 regno = FIRST_SSE_REG;
7267 /* 32-byte vector modes in %ymm0. */
7268 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7269 regno = FIRST_SSE_REG;
7271 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7272 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7273 regno = FIRST_FLOAT_REG;
7274 else
7275 /* Most things go in %eax. */
7276 regno = AX_REG;
7278 /* Override FP return register with %xmm0 for local functions when
7279 SSE math is enabled or for functions with sseregparm attribute. */
7280 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7282 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7283 if ((sse_level >= 1 && mode == SFmode)
7284 || (sse_level == 2 && mode == DFmode))
7285 regno = FIRST_SSE_REG;
7288 /* OImode shouldn't be used directly. */
7289 gcc_assert (mode != OImode);
7291 return gen_rtx_REG (orig_mode, regno);
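/* Worked example of the 32-bit return conventions above: an int comes
   back in %eax, a float or double in %st(0) by default (or in %xmm0 for
   local/sseregparm functions when the SSE level permits), an __m64
   value in %mm0, and __m128 / TImode values in %xmm0.  */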
7294 static rtx
7295 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7296 const_tree valtype)
7298 rtx ret;
7300 /* Handle libcalls, which don't provide a type node. */
7301 if (valtype == NULL)
7303 unsigned int regno;
7305 switch (mode)
7307 case SFmode:
7308 case SCmode:
7309 case DFmode:
7310 case DCmode:
7311 case TFmode:
7312 case SDmode:
7313 case DDmode:
7314 case TDmode:
7315 regno = FIRST_SSE_REG;
7316 break;
7317 case XFmode:
7318 case XCmode:
7319 regno = FIRST_FLOAT_REG;
7320 break;
7321 case TCmode:
7322 return NULL;
7323 default:
7324 regno = AX_REG;
7327 return gen_rtx_REG (mode, regno);
7329 else if (POINTER_TYPE_P (valtype)
7330 && !upc_shared_type_p (TREE_TYPE (valtype)))
7332 /* Pointers are always returned in word_mode. */
7333 mode = word_mode;
7336 ret = construct_container (mode, orig_mode, valtype, 1,
7337 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7338 x86_64_int_return_registers, 0);
7340 /* For zero sized structures, construct_container returns NULL, but we
7341 need to keep rest of compiler happy by returning meaningful value. */
7342 if (!ret)
7343 ret = gen_rtx_REG (orig_mode, AX_REG);
7345 return ret;
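/* Sketch of typical SysV x86-64 return values produced by the
   classification above (default options, hypothetical types):

       long                      -> %rax
       double                    -> %xmm0
       struct { long a, b; }     -> %rax:%rdx
       struct { double x, y; }   -> %xmm0:%xmm1
       empty struct              -> dummy %rax (see the NULL check)  */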
7348 static rtx
7349 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7351 unsigned int regno = AX_REG;
7353 if (TARGET_SSE)
7355 switch (GET_MODE_SIZE (mode))
7357 case 16:
7358 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7359 && !COMPLEX_MODE_P (mode))
7360 regno = FIRST_SSE_REG;
7361 break;
7362 case 8:
7363 case 4:
7364 if (mode == SFmode || mode == DFmode)
7365 regno = FIRST_SSE_REG;
7366 break;
7367 default:
7368 break;
7371 return gen_rtx_REG (orig_mode, regno);
7374 static rtx
7375 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7376 enum machine_mode orig_mode, enum machine_mode mode)
7378 const_tree fn, fntype;
7380 fn = NULL_TREE;
7381 if (fntype_or_decl && DECL_P (fntype_or_decl))
7382 fn = fntype_or_decl;
7383 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7385 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7386 return function_value_ms_64 (orig_mode, mode);
7387 else if (TARGET_64BIT)
7388 return function_value_64 (orig_mode, mode, valtype);
7389 else
7390 return function_value_32 (orig_mode, mode, fntype, fn);
7393 static rtx
7394 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7395 bool outgoing ATTRIBUTE_UNUSED)
7397 enum machine_mode mode, orig_mode;
7399 orig_mode = TYPE_MODE (valtype);
7400 mode = type_natural_mode (valtype, NULL);
7401 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7404 /* Pointer function arguments and return values are promoted to
7405 word_mode. */
7407 static enum machine_mode
7408 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7409 int *punsignedp, const_tree fntype,
7410 int for_return)
7412 if (type != NULL_TREE && POINTER_TYPE_P (type))
7414 if (upc_shared_type_p (TREE_TYPE (type)))
7416 *punsignedp = 1;
7417 return TYPE_MODE (upc_pts_rep_type_node);
7419 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7420 return word_mode;
7422 return default_promote_function_mode (type, mode, punsignedp, fntype,
7423 for_return);
7426 /* Return true if a structure, union or array with MODE containing FIELD
7427 should be accessed using BLKmode. */
7429 static bool
7430 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7432 /* Union with XFmode must be in BLKmode. */
7433 return (mode == XFmode
7434 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7435 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7439 ix86_libcall_value (enum machine_mode mode)
7441 return ix86_function_value_1 (NULL, NULL, mode, mode);
7444 /* Return true iff type is returned in memory. */
7446 static bool ATTRIBUTE_UNUSED
7447 return_in_memory_32 (const_tree type, enum machine_mode mode)
7449 HOST_WIDE_INT size;
7451 if (mode == BLKmode)
7452 return true;
7454 size = int_size_in_bytes (type);
7456 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7457 return false;
7459 if (VECTOR_MODE_P (mode) || mode == TImode)
7461 /* User-created vectors small enough to fit in EAX. */
7462 if (size < 8)
7463 return false;
7465 /* MMX/3dNow values are returned in MM0,
7466 except when MMX doesn't exist or the ABI prescribes otherwise. */
7467 if (size == 8)
7468 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7470 /* SSE values are returned in XMM0, except when it doesn't exist. */
7471 if (size == 16)
7472 return !TARGET_SSE;
7474 /* AVX values are returned in YMM0, except when it doesn't exist. */
7475 if (size == 32)
7476 return !TARGET_AVX;
7479 if (mode == XFmode)
7480 return false;
7482 if (size > 12)
7483 return true;
7485 /* OImode shouldn't be used directly. */
7486 gcc_assert (mode != OImode);
7488 return false;
7491 static bool ATTRIBUTE_UNUSED
7492 return_in_memory_64 (const_tree type, enum machine_mode mode)
7494 int needed_intregs, needed_sseregs;
7495 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7498 static bool ATTRIBUTE_UNUSED
7499 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7501 HOST_WIDE_INT size = int_size_in_bytes (type);
7503 /* __m128 is returned in xmm0. */
7504 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7505 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7506 return false;
7508 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7509 return size != 1 && size != 2 && size != 4 && size != 8;
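/* Example behaviour of the MS x64 predicate above: a 3-byte struct is
   returned in memory, an 8-byte struct in %rax, and a 16-byte __m128
   (or any other non-complex 16-byte scalar/vector mode) in %xmm0, so
   only the first case makes ix86_return_in_memory answer true.  */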
7512 static bool
7513 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7515 #ifdef SUBTARGET_RETURN_IN_MEMORY
7516 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7517 #else
7518 const enum machine_mode mode = type_natural_mode (type, NULL);
7520 if (TARGET_64BIT)
7522 if (ix86_function_type_abi (fntype) == MS_ABI)
7523 return return_in_memory_ms_64 (type, mode);
7524 else
7525 return return_in_memory_64 (type, mode);
7527 else
7528 return return_in_memory_32 (type, mode);
7529 #endif
7532 /* When returning SSE vector types, we have a choice of either
7533 (1) being abi incompatible with a -march switch, or
7534 (2) generating an error.
7535 Given no good solution, I think the safest thing is one warning.
7536 The user won't be able to use -Werror, but....
7538 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7539 called in response to actually generating a caller or callee that
7540 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7541 via aggregate_value_p for general type probing from tree-ssa. */
7543 static rtx
7544 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7546 static bool warnedsse, warnedmmx;
7548 if (!TARGET_64BIT && type)
7550 /* Look at the return type of the function, not the function type. */
7551 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7553 if (!TARGET_SSE && !warnedsse)
7555 if (mode == TImode
7556 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7558 warnedsse = true;
7559 warning (0, "SSE vector return without SSE enabled "
7560 "changes the ABI");
7564 if (!TARGET_MMX && !warnedmmx)
7566 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7568 warnedmmx = true;
7569 warning (0, "MMX vector return without MMX enabled "
7570 "changes the ABI");
7575 return NULL;
7579 /* Create the va_list data type. */
7581 /* Returns the calling convention specific va_list data type.
7582 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7584 static tree
7585 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7587 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7589 /* For i386 we use plain pointer to argument area. */
7590 if (!TARGET_64BIT || abi == MS_ABI)
7591 return build_pointer_type (char_type_node);
7593 record = lang_hooks.types.make_type (RECORD_TYPE);
7594 type_decl = build_decl (BUILTINS_LOCATION,
7595 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7597 f_gpr = build_decl (BUILTINS_LOCATION,
7598 FIELD_DECL, get_identifier ("gp_offset"),
7599 unsigned_type_node);
7600 f_fpr = build_decl (BUILTINS_LOCATION,
7601 FIELD_DECL, get_identifier ("fp_offset"),
7602 unsigned_type_node);
7603 f_ovf = build_decl (BUILTINS_LOCATION,
7604 FIELD_DECL, get_identifier ("overflow_arg_area"),
7605 ptr_type_node);
7606 f_sav = build_decl (BUILTINS_LOCATION,
7607 FIELD_DECL, get_identifier ("reg_save_area"),
7608 ptr_type_node);
7610 va_list_gpr_counter_field = f_gpr;
7611 va_list_fpr_counter_field = f_fpr;
7613 DECL_FIELD_CONTEXT (f_gpr) = record;
7614 DECL_FIELD_CONTEXT (f_fpr) = record;
7615 DECL_FIELD_CONTEXT (f_ovf) = record;
7616 DECL_FIELD_CONTEXT (f_sav) = record;
7618 TYPE_STUB_DECL (record) = type_decl;
7619 TYPE_NAME (record) = type_decl;
7620 TYPE_FIELDS (record) = f_gpr;
7621 DECL_CHAIN (f_gpr) = f_fpr;
7622 DECL_CHAIN (f_fpr) = f_ovf;
7623 DECL_CHAIN (f_ovf) = f_sav;
7625 layout_type (record);
7627 /* The correct type is an array type of one element. */
7628 return build_array_type (record, build_index_type (size_zero_node));
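/* The record built above corresponds to the familiar SysV layout; as a
   rough sketch, the C equivalent is approximately:

       typedef struct __va_list_tag {
	 unsigned int gp_offset;
	 unsigned int fp_offset;
	 void *overflow_arg_area;
	 void *reg_save_area;
       } va_list[1];

   while ia32 and the MS ABI keep va_list as a plain char *.  */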
7631 /* Setup the builtin va_list data type and for 64-bit the additional
7632 calling convention specific va_list data types. */
7634 static tree
7635 ix86_build_builtin_va_list (void)
7637 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7639 /* Initialize abi specific va_list builtin types. */
7640 if (TARGET_64BIT)
7642 tree t;
7643 if (ix86_abi == MS_ABI)
7645 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7646 if (TREE_CODE (t) != RECORD_TYPE)
7647 t = build_variant_type_copy (t);
7648 sysv_va_list_type_node = t;
7650 else
7652 t = ret;
7653 if (TREE_CODE (t) != RECORD_TYPE)
7654 t = build_variant_type_copy (t);
7655 sysv_va_list_type_node = t;
7657 if (ix86_abi != MS_ABI)
7659 t = ix86_build_builtin_va_list_abi (MS_ABI);
7660 if (TREE_CODE (t) != RECORD_TYPE)
7661 t = build_variant_type_copy (t);
7662 ms_va_list_type_node = t;
7664 else
7666 t = ret;
7667 if (TREE_CODE (t) != RECORD_TYPE)
7668 t = build_variant_type_copy (t);
7669 ms_va_list_type_node = t;
7673 return ret;
7676 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7678 static void
7679 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7681 rtx save_area, mem;
7682 alias_set_type set;
7683 int i, max;
7685 /* GPR size of varargs save area. */
7686 if (cfun->va_list_gpr_size)
7687 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7688 else
7689 ix86_varargs_gpr_size = 0;
7691 /* FPR size of varargs save area. We don't need it if we don't pass
7692 anything in SSE registers. */
7693 if (TARGET_SSE && cfun->va_list_fpr_size)
7694 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7695 else
7696 ix86_varargs_fpr_size = 0;
7698 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7699 return;
7701 save_area = frame_pointer_rtx;
7702 set = get_varargs_alias_set ();
7704 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7705 if (max > X86_64_REGPARM_MAX)
7706 max = X86_64_REGPARM_MAX;
7708 for (i = cum->regno; i < max; i++)
7710 mem = gen_rtx_MEM (word_mode,
7711 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7712 MEM_NOTRAP_P (mem) = 1;
7713 set_mem_alias_set (mem, set);
7714 emit_move_insn (mem,
7715 gen_rtx_REG (word_mode,
7716 x86_64_int_parameter_registers[i]));
7719 if (ix86_varargs_fpr_size)
7721 enum machine_mode smode;
7722 rtx label, test;
7724 /* Now emit code to save SSE registers. The AX parameter contains number
7725 of SSE parameter registers used to call this function, though all we
7726 actually check here is the zero/non-zero status. */
7728 label = gen_label_rtx ();
7729 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7730 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7731 label));
7733 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7734 we used movdqa (i.e. TImode) instead? Perhaps even better would
7735 be if we could determine the real mode of the data, via a hook
7736 into pass_stdarg. Ignore all that for now. */
7737 smode = V4SFmode;
7738 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7739 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7741 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7742 if (max > X86_64_SSE_REGPARM_MAX)
7743 max = X86_64_SSE_REGPARM_MAX;
7745 for (i = cum->sse_regno; i < max; ++i)
7747 mem = plus_constant (Pmode, save_area,
7748 i * 16 + ix86_varargs_gpr_size);
7749 mem = gen_rtx_MEM (smode, mem);
7750 MEM_NOTRAP_P (mem) = 1;
7751 set_mem_alias_set (mem, set);
7752 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7754 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7757 emit_label (label);
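/* Resulting save-area layout, for illustration, with the default
   X86_64_REGPARM_MAX of 6 and X86_64_SSE_REGPARM_MAX of 8 and the full
   va_list_gpr/fpr sizes (offsets relative to the start of the area):

       0 ..  47   %rdi %rsi %rdx %rcx %r8 %r9 (8 bytes each)
      48 .. 175   %xmm0 .. %xmm7 (16 bytes each, stored only when the
		   hidden %al value is nonzero, per the branch above)  */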
7761 static void
7762 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7764 alias_set_type set = get_varargs_alias_set ();
7765 int i;
7767 /* Reset to zero, as there might be a sysv vaarg used
7768 before. */
7769 ix86_varargs_gpr_size = 0;
7770 ix86_varargs_fpr_size = 0;
7772 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7774 rtx reg, mem;
7776 mem = gen_rtx_MEM (Pmode,
7777 plus_constant (Pmode, virtual_incoming_args_rtx,
7778 i * UNITS_PER_WORD));
7779 MEM_NOTRAP_P (mem) = 1;
7780 set_mem_alias_set (mem, set);
7782 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7783 emit_move_insn (mem, reg);
7787 static void
7788 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7789 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7790 int no_rtl)
7792 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7793 CUMULATIVE_ARGS next_cum;
7794 tree fntype;
7796 /* This argument doesn't appear to be used anymore. Which is good,
7797 because the old code here didn't suppress rtl generation. */
7798 gcc_assert (!no_rtl);
7800 if (!TARGET_64BIT)
7801 return;
7803 fntype = TREE_TYPE (current_function_decl);
7805 /* For varargs, we do not want to skip the dummy va_dcl argument.
7806 For stdargs, we do want to skip the last named argument. */
7807 next_cum = *cum;
7808 if (stdarg_p (fntype))
7809 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7810 true);
7812 if (cum->call_abi == MS_ABI)
7813 setup_incoming_varargs_ms_64 (&next_cum);
7814 else
7815 setup_incoming_varargs_64 (&next_cum);
7818 /* Checks if TYPE is of kind va_list char *. */
7820 static bool
7821 is_va_list_char_pointer (tree type)
7823 tree canonic;
7825 /* For 32-bit it is always true. */
7826 if (!TARGET_64BIT)
7827 return true;
7828 canonic = ix86_canonical_va_list_type (type);
7829 return (canonic == ms_va_list_type_node
7830 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7833 /* Implement va_start. */
7835 static void
7836 ix86_va_start (tree valist, rtx nextarg)
7838 HOST_WIDE_INT words, n_gpr, n_fpr;
7839 tree f_gpr, f_fpr, f_ovf, f_sav;
7840 tree gpr, fpr, ovf, sav, t;
7841 tree type;
7842 rtx ovf_rtx;
7844 if (flag_split_stack
7845 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7847 unsigned int scratch_regno;
7849 /* When we are splitting the stack, we can't refer to the stack
7850 arguments using internal_arg_pointer, because they may be on
7851 the old stack. The split stack prologue will arrange to
7852 leave a pointer to the old stack arguments in a scratch
7853 register, which we here copy to a pseudo-register. The split
7854 stack prologue can't set the pseudo-register directly because
7855 it (the prologue) runs before any registers have been saved. */
7857 scratch_regno = split_stack_prologue_scratch_regno ();
7858 if (scratch_regno != INVALID_REGNUM)
7860 rtx reg, seq;
7862 reg = gen_reg_rtx (Pmode);
7863 cfun->machine->split_stack_varargs_pointer = reg;
7865 start_sequence ();
7866 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7867 seq = get_insns ();
7868 end_sequence ();
7870 push_topmost_sequence ();
7871 emit_insn_after (seq, entry_of_function ());
7872 pop_topmost_sequence ();
7876 /* Only 64bit target needs something special. */
7877 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7879 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7880 std_expand_builtin_va_start (valist, nextarg);
7881 else
7883 rtx va_r, next;
7885 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7886 next = expand_binop (ptr_mode, add_optab,
7887 cfun->machine->split_stack_varargs_pointer,
7888 crtl->args.arg_offset_rtx,
7889 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7890 convert_move (va_r, next, 0);
7892 return;
7895 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7896 f_fpr = DECL_CHAIN (f_gpr);
7897 f_ovf = DECL_CHAIN (f_fpr);
7898 f_sav = DECL_CHAIN (f_ovf);
7900 valist = build_simple_mem_ref (valist);
7901 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7902 /* The following should be folded into the MEM_REF offset. */
7903 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7904 f_gpr, NULL_TREE);
7905 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7906 f_fpr, NULL_TREE);
7907 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7908 f_ovf, NULL_TREE);
7909 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7910 f_sav, NULL_TREE);
7912 /* Count number of gp and fp argument registers used. */
7913 words = crtl->args.info.words;
7914 n_gpr = crtl->args.info.regno;
7915 n_fpr = crtl->args.info.sse_regno;
7917 if (cfun->va_list_gpr_size)
7919 type = TREE_TYPE (gpr);
7920 t = build2 (MODIFY_EXPR, type,
7921 gpr, build_int_cst (type, n_gpr * 8));
7922 TREE_SIDE_EFFECTS (t) = 1;
7923 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7926 if (TARGET_SSE && cfun->va_list_fpr_size)
7928 type = TREE_TYPE (fpr);
7929 t = build2 (MODIFY_EXPR, type, fpr,
7930 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7931 TREE_SIDE_EFFECTS (t) = 1;
7932 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7935 /* Find the overflow area. */
7936 type = TREE_TYPE (ovf);
7937 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7938 ovf_rtx = crtl->args.internal_arg_pointer;
7939 else
7940 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7941 t = make_tree (type, ovf_rtx);
7942 if (words != 0)
7943 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7944 t = build2 (MODIFY_EXPR, type, ovf, t);
7945 TREE_SIDE_EFFECTS (t) = 1;
7946 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7948 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7950 /* Find the register save area.
7951 The function prologue saves it right above the stack frame. */
7952 type = TREE_TYPE (sav);
7953 t = make_tree (type, frame_pointer_rtx);
7954 if (!ix86_varargs_gpr_size)
7955 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7956 t = build2 (MODIFY_EXPR, type, sav, t);
7957 TREE_SIDE_EFFECTS (t) = 1;
7958 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
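/* Concrete va_start example (hypothetical prototype, default options):
   for

       void f (int a, double b, ...);

   one GPR and one SSE register are consumed by the named arguments, so
   the code above initializes gp_offset = 8, fp_offset = 48 + 16 = 64,
   points overflow_arg_area at the first stack-passed argument and
   reg_save_area at the block saved by setup_incoming_varargs_64.  */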
7962 /* Implement va_arg. */
7964 static tree
7965 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7966 gimple_seq *post_p)
7968 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7969 tree f_gpr, f_fpr, f_ovf, f_sav;
7970 tree gpr, fpr, ovf, sav, t;
7971 int size, rsize;
7972 tree lab_false, lab_over = NULL_TREE;
7973 tree addr, t2;
7974 rtx container;
7975 int indirect_p = 0;
7976 tree ptrtype;
7977 enum machine_mode nat_mode;
7978 unsigned int arg_boundary;
7980 /* Only 64bit target needs something special. */
7981 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7982 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7984 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7985 f_fpr = DECL_CHAIN (f_gpr);
7986 f_ovf = DECL_CHAIN (f_fpr);
7987 f_sav = DECL_CHAIN (f_ovf);
7989 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7990 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7991 valist = build_va_arg_indirect_ref (valist);
7992 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7993 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7994 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7996 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7997 if (indirect_p)
7998 type = build_pointer_type (type);
7999 size = int_size_in_bytes (type);
8000 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8002 nat_mode = type_natural_mode (type, NULL);
8003 switch (nat_mode)
8005 case V8SFmode:
8006 case V8SImode:
8007 case V32QImode:
8008 case V16HImode:
8009 case V4DFmode:
8010 case V4DImode:
8011 /* Unnamed 256bit vector mode parameters are passed on stack. */
8012 if (!TARGET_64BIT_MS_ABI)
8014 container = NULL;
8015 break;
8018 default:
8019 container = construct_container (nat_mode, TYPE_MODE (type),
8020 type, 0, X86_64_REGPARM_MAX,
8021 X86_64_SSE_REGPARM_MAX, intreg,
8023 break;
8026 /* Pull the value out of the saved registers. */
8028 addr = create_tmp_var (ptr_type_node, "addr");
8030 if (container)
8032 int needed_intregs, needed_sseregs;
8033 bool need_temp;
8034 tree int_addr, sse_addr;
8036 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8037 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8039 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8041 need_temp = (!REG_P (container)
8042 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8043 || TYPE_ALIGN (type) > 128));
8045 /* In case we are passing a structure, verify that it is a consecutive
8046 block in the register save area.  If not, we need to do moves. */
8047 if (!need_temp && !REG_P (container))
8049 /* Verify that all registers are strictly consecutive.  */
8050 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8052 int i;
8054 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8056 rtx slot = XVECEXP (container, 0, i);
8057 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8058 || INTVAL (XEXP (slot, 1)) != i * 16)
8059 need_temp = 1;
8062 else
8064 int i;
8066 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8068 rtx slot = XVECEXP (container, 0, i);
8069 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8070 || INTVAL (XEXP (slot, 1)) != i * 8)
8071 need_temp = 1;
8075 if (!need_temp)
8077 int_addr = addr;
8078 sse_addr = addr;
8080 else
8082 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8083 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8086 /* First ensure that we fit completely in registers. */
8087 if (needed_intregs)
8089 t = build_int_cst (TREE_TYPE (gpr),
8090 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8091 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8092 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8093 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8094 gimplify_and_add (t, pre_p);
8096 if (needed_sseregs)
8098 t = build_int_cst (TREE_TYPE (fpr),
8099 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8100 + X86_64_REGPARM_MAX * 8);
8101 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8102 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8103 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8104 gimplify_and_add (t, pre_p);
8107 /* Compute index to start of area used for integer regs. */
8108 if (needed_intregs)
8110 /* int_addr = gpr + sav; */
8111 t = fold_build_pointer_plus (sav, gpr);
8112 gimplify_assign (int_addr, t, pre_p);
8114 if (needed_sseregs)
8116 /* sse_addr = fpr + sav; */
8117 t = fold_build_pointer_plus (sav, fpr);
8118 gimplify_assign (sse_addr, t, pre_p);
8120 if (need_temp)
8122 int i, prev_size = 0;
8123 tree temp = create_tmp_var (type, "va_arg_tmp");
8125 /* addr = &temp; */
8126 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8127 gimplify_assign (addr, t, pre_p);
8129 for (i = 0; i < XVECLEN (container, 0); i++)
8131 rtx slot = XVECEXP (container, 0, i);
8132 rtx reg = XEXP (slot, 0);
8133 enum machine_mode mode = GET_MODE (reg);
8134 tree piece_type;
8135 tree addr_type;
8136 tree daddr_type;
8137 tree src_addr, src;
8138 int src_offset;
8139 tree dest_addr, dest;
8140 int cur_size = GET_MODE_SIZE (mode);
8142 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8143 prev_size = INTVAL (XEXP (slot, 1));
8144 if (prev_size + cur_size > size)
8146 cur_size = size - prev_size;
8147 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8148 if (mode == BLKmode)
8149 mode = QImode;
8151 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8152 if (mode == GET_MODE (reg))
8153 addr_type = build_pointer_type (piece_type);
8154 else
8155 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8156 true);
8157 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8158 true);
8160 if (SSE_REGNO_P (REGNO (reg)))
8162 src_addr = sse_addr;
8163 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8165 else
8167 src_addr = int_addr;
8168 src_offset = REGNO (reg) * 8;
8170 src_addr = fold_convert (addr_type, src_addr);
8171 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8173 dest_addr = fold_convert (daddr_type, addr);
8174 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8175 if (cur_size == GET_MODE_SIZE (mode))
8177 src = build_va_arg_indirect_ref (src_addr);
8178 dest = build_va_arg_indirect_ref (dest_addr);
8180 gimplify_assign (dest, src, pre_p);
8182 else
8184 tree copy
8185 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8186 3, dest_addr, src_addr,
8187 size_int (cur_size));
8188 gimplify_and_add (copy, pre_p);
8190 prev_size += cur_size;
8194 if (needed_intregs)
8196 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8197 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8198 gimplify_assign (gpr, t, pre_p);
8201 if (needed_sseregs)
8203 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8204 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8205 gimplify_assign (fpr, t, pre_p);
8208 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8210 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8213 /* ... otherwise out of the overflow area. */
8215 /* When the caller aligns a parameter on the stack, any alignment
8216 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8217 MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee here with the
8218 caller. */
8219 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8220 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8221 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8223 /* Care for on-stack alignment if needed. */
8224 if (arg_boundary <= 64 || size == 0)
8225 t = ovf;
8226 else
8228 HOST_WIDE_INT align = arg_boundary / 8;
8229 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8230 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8231 build_int_cst (TREE_TYPE (t), -align));
8234 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8235 gimplify_assign (addr, t, pre_p);
8237 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8238 gimplify_assign (unshare_expr (ovf), t, pre_p);
8240 if (container)
8241 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8243 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8244 addr = fold_convert (ptrtype, addr);
8246 if (indirect_p)
8247 addr = build_va_arg_indirect_ref (addr);
8248 return build_va_arg_indirect_ref (addr);
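/* The gimple emitted above amounts to roughly the following hand-waved
   pseudo-C sketch (register counts assume the default 6 GPR / 8 SSE
   maxima; the by-reference and temporary-copy paths are omitted):

       if (gp_offset > 48 - needed_gp * 8
	   || fp_offset > 176 - needed_sse * 16)
	 addr = align (overflow_arg_area);   /- spill to the stack area -/
       else
	 addr = reg_save_area + offset;      /- fetch from saved regs -/
       return *(type *) addr;  */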
8251 /* Return true if OPNUM's MEM should be matched
8252 in movabs* patterns. */
8254 bool
8255 ix86_check_movabs (rtx insn, int opnum)
8257 rtx set, mem;
8259 set = PATTERN (insn);
8260 if (GET_CODE (set) == PARALLEL)
8261 set = XVECEXP (set, 0, 0);
8262 gcc_assert (GET_CODE (set) == SET);
8263 mem = XEXP (set, opnum);
8264 while (GET_CODE (mem) == SUBREG)
8265 mem = SUBREG_REG (mem);
8266 gcc_assert (MEM_P (mem));
8267 return volatile_ok || !MEM_VOLATILE_P (mem);
8270 /* Initialize the table of extra 80387 mathematical constants. */
8272 static void
8273 init_ext_80387_constants (void)
8275 static const char * cst[5] =
8277 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8278 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8279 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8280 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8281 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8283 int i;
8285 for (i = 0; i < 5; i++)
8287 real_from_string (&ext_80387_constants_table[i], cst[i]);
8288 /* Ensure each constant is rounded to XFmode precision. */
8289 real_convert (&ext_80387_constants_table[i],
8290 XFmode, &ext_80387_constants_table[i]);
8293 ext_80387_constants_init = 1;
8296 /* Return non-zero if the constant is something that
8297 can be loaded with a special instruction. */
8300 standard_80387_constant_p (rtx x)
8302 enum machine_mode mode = GET_MODE (x);
8304 REAL_VALUE_TYPE r;
8306 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8307 return -1;
8309 if (x == CONST0_RTX (mode))
8310 return 1;
8311 if (x == CONST1_RTX (mode))
8312 return 2;
8314 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8316 /* For XFmode constants, try to find a special 80387 instruction when
8317 optimizing for size or on those CPUs that benefit from them. */
8318 if (mode == XFmode
8319 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8321 int i;
8323 if (! ext_80387_constants_init)
8324 init_ext_80387_constants ();
8326 for (i = 0; i < 5; i++)
8327 if (real_identical (&r, &ext_80387_constants_table[i]))
8328 return i + 3;
8331 /* Load of the constant -0.0 or -1.0 will be split as
8332 fldz;fchs or fld1;fchs sequence. */
8333 if (real_isnegzero (&r))
8334 return 8;
8335 if (real_identical (&r, &dconstm1))
8336 return 9;
8338 return 0;
8341 /* Return the opcode of the special instruction to be used to load
8342 the constant X. */
8344 const char *
8345 standard_80387_constant_opcode (rtx x)
8347 switch (standard_80387_constant_p (x))
8349 case 1:
8350 return "fldz";
8351 case 2:
8352 return "fld1";
8353 case 3:
8354 return "fldlg2";
8355 case 4:
8356 return "fldln2";
8357 case 5:
8358 return "fldl2e";
8359 case 6:
8360 return "fldl2t";
8361 case 7:
8362 return "fldpi";
8363 case 8:
8364 case 9:
8365 return "#";
8366 default:
8367 gcc_unreachable ();
8371 /* Return the CONST_DOUBLE representing the 80387 constant that is
8372 loaded by the specified special instruction. The argument IDX
8373 matches the return value from standard_80387_constant_p. */
8376 standard_80387_constant_rtx (int idx)
8378 int i;
8380 if (! ext_80387_constants_init)
8381 init_ext_80387_constants ();
8383 switch (idx)
8385 case 3:
8386 case 4:
8387 case 5:
8388 case 6:
8389 case 7:
8390 i = idx - 3;
8391 break;
8393 default:
8394 gcc_unreachable ();
8397 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8398 XFmode);
8401 /* Return 1 if X is all 0s and 2 if X is all 1s
8402 in a supported SSE/AVX vector mode. */
8405 standard_sse_constant_p (rtx x)
8407 enum machine_mode mode = GET_MODE (x);
8409 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8410 return 1;
8411 if (vector_all_ones_operand (x, mode))
8412 switch (mode)
8414 case V16QImode:
8415 case V8HImode:
8416 case V4SImode:
8417 case V2DImode:
8418 if (TARGET_SSE2)
8419 return 2;
8420 case V32QImode:
8421 case V16HImode:
8422 case V8SImode:
8423 case V4DImode:
8424 if (TARGET_AVX2)
8425 return 2;
8426 default:
8427 break;
8430 return 0;
8433 /* Return the opcode of the special instruction to be used to load
8434 the constant X. */
8436 const char *
8437 standard_sse_constant_opcode (rtx insn, rtx x)
8439 switch (standard_sse_constant_p (x))
8441 case 1:
8442 switch (get_attr_mode (insn))
8444 case MODE_TI:
8445 return "%vpxor\t%0, %d0";
8446 case MODE_V2DF:
8447 return "%vxorpd\t%0, %d0";
8448 case MODE_V4SF:
8449 return "%vxorps\t%0, %d0";
8451 case MODE_OI:
8452 return "vpxor\t%x0, %x0, %x0";
8453 case MODE_V4DF:
8454 return "vxorpd\t%x0, %x0, %x0";
8455 case MODE_V8SF:
8456 return "vxorps\t%x0, %x0, %x0";
8458 default:
8459 break;
8462 case 2:
8463 if (TARGET_AVX)
8464 return "vpcmpeqd\t%0, %0, %0";
8465 else
8466 return "pcmpeqd\t%0, %0";
8468 default:
8469 break;
8471 gcc_unreachable ();
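/* For instance, an all-zero V4SFmode constant selects "xorps %xmm0,
   %xmm0" (or the VEX-encoded "vxorps %xmm0, %xmm0, %xmm0" under AVX),
   while an all-ones V4SImode constant under SSE2 is materialized with
   "pcmpeqd %xmm0, %xmm0"; the register shown is just an example
   operand.  */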
8474 /* Returns true if OP contains a symbol reference */
8476 bool
8477 symbolic_reference_mentioned_p (rtx op)
8479 const char *fmt;
8480 int i;
8482 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8483 return true;
8485 fmt = GET_RTX_FORMAT (GET_CODE (op));
8486 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8488 if (fmt[i] == 'E')
8490 int j;
8492 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8493 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8494 return true;
8497 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8498 return true;
8501 return false;
8504 /* Return true if it is appropriate to emit `ret' instructions in the
8505 body of a function. Do this only if the epilogue is simple, needing a
8506 couple of insns. Prior to reloading, we can't tell how many registers
8507 must be saved, so return false then. Return false if there is no frame
8508 marker to de-allocate. */
8510 bool
8511 ix86_can_use_return_insn_p (void)
8513 struct ix86_frame frame;
8515 if (! reload_completed || frame_pointer_needed)
8516 return 0;
8518 /* Don't allow more than 32k pop, since that's all we can do
8519 with one instruction. */
8520 if (crtl->args.pops_args && crtl->args.size >= 32768)
8521 return 0;
8523 ix86_compute_frame_layout (&frame);
8524 return (frame.stack_pointer_offset == UNITS_PER_WORD
8525 && (frame.nregs + frame.nsseregs) == 0);
8528 /* Value should be nonzero if functions must have frame pointers.
8529 Zero means the frame pointer need not be set up (and parms may
8530 be accessed via the stack pointer) in functions that seem suitable. */
8532 static bool
8533 ix86_frame_pointer_required (void)
8535 /* If we accessed previous frames, then the generated code expects
8536 to be able to access the saved ebp value in our frame. */
8537 if (cfun->machine->accesses_prev_frame)
8538 return true;
8540 /* Several x86 os'es need a frame pointer for other reasons,
8541 usually pertaining to setjmp. */
8542 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8543 return true;
8545 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8546 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8547 return true;
8549 /* Under Win64 SEH, very large frames need a frame pointer, as the
8550 maximum stack allocation is 4GB. */
8551 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8552 return true;
8554 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8555 turns off the frame pointer by default. Turn it back on now if
8556 we've not got a leaf function. */
8557 if (TARGET_OMIT_LEAF_FRAME_POINTER
8558 && (!crtl->is_leaf
8559 || ix86_current_function_calls_tls_descriptor))
8560 return true;
8562 if (crtl->profile && !flag_fentry)
8563 return true;
8565 return false;
8568 /* Record that the current function accesses previous call frames. */
8570 void
8571 ix86_setup_frame_addresses (void)
8573 cfun->machine->accesses_prev_frame = 1;
8576 #ifndef USE_HIDDEN_LINKONCE
8577 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8578 # define USE_HIDDEN_LINKONCE 1
8579 # else
8580 # define USE_HIDDEN_LINKONCE 0
8581 # endif
8582 #endif
8584 static int pic_labels_used;
8586 /* Fills in the label name that should be used for a pc thunk for
8587 the given register. */
8589 static void
8590 get_pc_thunk_name (char name[32], unsigned int regno)
8592 gcc_assert (!TARGET_64BIT);
8594 if (USE_HIDDEN_LINKONCE)
8595 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8596 else
8597 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8601 /* This function generates code for -fpic that loads %ebx with
8602 the return address of the caller and then returns. */
8604 static void
8605 ix86_code_end (void)
8607 rtx xops[2];
8608 int regno;
8610 for (regno = AX_REG; regno <= SP_REG; regno++)
8612 char name[32];
8613 tree decl;
8615 if (!(pic_labels_used & (1 << regno)))
8616 continue;
8618 get_pc_thunk_name (name, regno);
8620 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8621 get_identifier (name),
8622 build_function_type_list (void_type_node, NULL_TREE));
8623 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8624 NULL_TREE, void_type_node);
8625 TREE_PUBLIC (decl) = 1;
8626 TREE_STATIC (decl) = 1;
8627 DECL_IGNORED_P (decl) = 1;
8629 #if TARGET_MACHO
8630 if (TARGET_MACHO)
8632 switch_to_section (darwin_sections[text_coal_section]);
8633 fputs ("\t.weak_definition\t", asm_out_file);
8634 assemble_name (asm_out_file, name);
8635 fputs ("\n\t.private_extern\t", asm_out_file);
8636 assemble_name (asm_out_file, name);
8637 putc ('\n', asm_out_file);
8638 ASM_OUTPUT_LABEL (asm_out_file, name);
8639 DECL_WEAK (decl) = 1;
8641 else
8642 #endif
8643 if (USE_HIDDEN_LINKONCE)
8645 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8647 targetm.asm_out.unique_section (decl, 0);
8648 switch_to_section (get_named_section (decl, NULL, 0));
8650 targetm.asm_out.globalize_label (asm_out_file, name);
8651 fputs ("\t.hidden\t", asm_out_file);
8652 assemble_name (asm_out_file, name);
8653 putc ('\n', asm_out_file);
8654 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8656 else
8658 switch_to_section (text_section);
8659 ASM_OUTPUT_LABEL (asm_out_file, name);
8662 DECL_INITIAL (decl) = make_node (BLOCK);
8663 current_function_decl = decl;
8664 init_function_start (decl);
8665 first_function_block_is_cold = false;
8666 /* Make sure unwind info is emitted for the thunk if needed. */
8667 final_start_function (emit_barrier (), asm_out_file, 1);
8669 /* Pad stack IP move with 4 instructions (two NOPs count
8670 as one instruction). */
8671 if (TARGET_PAD_SHORT_FUNCTION)
8673 int i = 8;
8675 while (i--)
8676 fputs ("\tnop\n", asm_out_file);
8679 xops[0] = gen_rtx_REG (Pmode, regno);
8680 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8681 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8682 fputs ("\tret\n", asm_out_file);
8683 final_end_function ();
8684 init_insn_lengths ();
8685 free_after_compilation (cfun);
8686 set_cfun (NULL);
8687 current_function_decl = NULL;
8690 if (flag_split_stack)
8691 file_end_indicate_split_stack ();
8694 /* Emit code for the SET_GOT patterns. */
8696 const char *
8697 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8699 rtx xops[3];
8701 xops[0] = dest;
8703 if (TARGET_VXWORKS_RTP && flag_pic)
8705 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8706 xops[2] = gen_rtx_MEM (Pmode,
8707 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8708 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8710 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8711 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8712 an unadorned address. */
8713 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8714 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8715 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8716 return "";
8719 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8721 if (!flag_pic)
8723 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8725 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8727 #if TARGET_MACHO
8728 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8729 is what will be referenced by the Mach-O PIC subsystem. */
8730 if (!label)
8731 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8732 #endif
8734 targetm.asm_out.internal_label (asm_out_file, "L",
8735 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8737 else
8739 char name[32];
8740 get_pc_thunk_name (name, REGNO (dest));
8741 pic_labels_used |= 1 << REGNO (dest);
8743 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8744 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8745 output_asm_insn ("call\t%X2", xops);
8746 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8747 is what will be referenced by the Mach-O PIC subsystem. */
8748 #if TARGET_MACHO
8749 if (!label)
8750 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8751 else
8752 targetm.asm_out.internal_label (asm_out_file, "L",
8753 CODE_LABEL_NUMBER (label));
8754 #endif
8757 if (!TARGET_MACHO)
8758 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8760 return "";
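/* On a typical ia32 PIC compilation the sequence emitted above is the
   familiar

       call	__x86.get_pc_thunk.bx
       addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   with the thunk body produced later by ix86_code_end; the non-PIC
   branch instead loads the label address with a plain mov.  */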
8763 /* Generate a "push" pattern for input ARG. */
8765 static rtx
8766 gen_push (rtx arg)
8768 struct machine_function *m = cfun->machine;
8770 if (m->fs.cfa_reg == stack_pointer_rtx)
8771 m->fs.cfa_offset += UNITS_PER_WORD;
8772 m->fs.sp_offset += UNITS_PER_WORD;
8774 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8775 arg = gen_rtx_REG (word_mode, REGNO (arg));
8777 return gen_rtx_SET (VOIDmode,
8778 gen_rtx_MEM (word_mode,
8779 gen_rtx_PRE_DEC (Pmode,
8780 stack_pointer_rtx)),
8781 arg);
8784 /* Generate a "pop" pattern for input ARG. */
8786 static rtx
8787 gen_pop (rtx arg)
8789 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8790 arg = gen_rtx_REG (word_mode, REGNO (arg));
8792 return gen_rtx_SET (VOIDmode,
8793 arg,
8794 gen_rtx_MEM (word_mode,
8795 gen_rtx_POST_INC (Pmode,
8796 stack_pointer_rtx)));
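/* Both helpers just wrap the obvious RTL; for example, pushing %rdi on
   x86-64 yields

       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI di))

   and gen_pop produces the matching post_inc load, with gen_push
   additionally bumping the CFA/SP offsets tracked in
   cfun->machine->fs.  */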
8799 /* Return >= 0 if there is an unused call-clobbered register available
8800 for the entire function. */
8802 static unsigned int
8803 ix86_select_alt_pic_regnum (void)
8805 if (crtl->is_leaf
8806 && !crtl->profile
8807 && !ix86_current_function_calls_tls_descriptor)
8809 int i, drap;
8810 /* Can't use the same register for both PIC and DRAP. */
8811 if (crtl->drap_reg)
8812 drap = REGNO (crtl->drap_reg);
8813 else
8814 drap = -1;
8815 for (i = 2; i >= 0; --i)
8816 if (i != drap && !df_regs_ever_live_p (i))
8817 return i;
8820 return INVALID_REGNUM;
8823 /* Return TRUE if we need to save REGNO. */
8825 static bool
8826 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8828 if (pic_offset_table_rtx
8829 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8830 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8831 || crtl->profile
8832 || crtl->calls_eh_return
8833 || crtl->uses_const_pool))
8834 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8836 if (crtl->calls_eh_return && maybe_eh_return)
8838 unsigned i;
8839 for (i = 0; ; i++)
8841 unsigned test = EH_RETURN_DATA_REGNO (i);
8842 if (test == INVALID_REGNUM)
8843 break;
8844 if (test == regno)
8845 return true;
8849 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8850 return true;
8852 return (df_regs_ever_live_p (regno)
8853 && !call_used_regs[regno]
8854 && !fixed_regs[regno]
8855 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8858 /* Return the number of saved general purpose registers. */
8860 static int
8861 ix86_nsaved_regs (void)
8863 int nregs = 0;
8864 int regno;
8866 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8867 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8868 nregs ++;
8869 return nregs;
8872 /* Return the number of saved SSE registers. */
8874 static int
8875 ix86_nsaved_sseregs (void)
8877 int nregs = 0;
8878 int regno;
8880 if (!TARGET_64BIT_MS_ABI)
8881 return 0;
8882 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8883 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8884 nregs ++;
8885 return nregs;
8888 /* Given FROM and TO register numbers, say whether this elimination is
8889 allowed. If stack alignment is needed, we can only replace argument
8890 pointer with hard frame pointer, or replace frame pointer with stack
8891 pointer. Otherwise, frame pointer elimination is automatically
8892 handled and all other eliminations are valid. */
8894 static bool
8895 ix86_can_eliminate (const int from, const int to)
8897 if (stack_realign_fp)
8898 return ((from == ARG_POINTER_REGNUM
8899 && to == HARD_FRAME_POINTER_REGNUM)
8900 || (from == FRAME_POINTER_REGNUM
8901 && to == STACK_POINTER_REGNUM));
8902 else
8903 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8906 /* Return the offset between two registers, one to be eliminated, and the other
8907 its replacement, at the start of a routine. */
8909 HOST_WIDE_INT
8910 ix86_initial_elimination_offset (int from, int to)
8912 struct ix86_frame frame;
8913 ix86_compute_frame_layout (&frame);
8915 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8916 return frame.hard_frame_pointer_offset;
8917 else if (from == FRAME_POINTER_REGNUM
8918 && to == HARD_FRAME_POINTER_REGNUM)
8919 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8920 else
8922 gcc_assert (to == STACK_POINTER_REGNUM);
8924 if (from == ARG_POINTER_REGNUM)
8925 return frame.stack_pointer_offset;
8927 gcc_assert (from == FRAME_POINTER_REGNUM);
8928 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8932 /* In a dynamically-aligned function, we can't know the offset from
8933 stack pointer to frame pointer, so we must ensure that setjmp
8934 eliminates fp against the hard fp (%ebp) rather than trying to
8935 index from %esp up to the top of the frame across a gap that is
8936 of unknown (at compile-time) size. */
8937 static rtx
8938 ix86_builtin_setjmp_frame_value (void)
8940 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8943 /* When using -fsplit-stack, the allocation routines set a field in
8944 the TCB to the bottom of the stack plus this much space, measured
8945 in bytes. */
8947 #define SPLIT_STACK_AVAILABLE 256
8949 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
8951 static void
8952 ix86_compute_frame_layout (struct ix86_frame *frame)
8954 unsigned HOST_WIDE_INT stack_alignment_needed;
8955 HOST_WIDE_INT offset;
8956 unsigned HOST_WIDE_INT preferred_alignment;
8957 HOST_WIDE_INT size = get_frame_size ();
8958 HOST_WIDE_INT to_allocate;
8960 frame->nregs = ix86_nsaved_regs ();
8961 frame->nsseregs = ix86_nsaved_sseregs ();
8963 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8964 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8966 /* The 64-bit MS ABI seems to require the stack alignment to always be 16, except
8967 for function prologues and leaf functions. */
8968 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8969 && (!crtl->is_leaf || cfun->calls_alloca != 0
8970 || ix86_current_function_calls_tls_descriptor))
8972 preferred_alignment = 16;
8973 stack_alignment_needed = 16;
8974 crtl->preferred_stack_boundary = 128;
8975 crtl->stack_alignment_needed = 128;
8978 gcc_assert (!size || stack_alignment_needed);
8979 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8980 gcc_assert (preferred_alignment <= stack_alignment_needed);
8982 /* For SEH we have to limit the amount of code movement into the prologue.
8983 At present we do this via a BLOCKAGE, at which point there's very little
8984 scheduling that can be done, which means that there's very little point
8985 in doing anything except PUSHs. */
8986 if (TARGET_SEH)
8987 cfun->machine->use_fast_prologue_epilogue = false;
8989 /* During the reload iteration the number of registers saved can change.
8990 Recompute the value as needed. Do not recompute when the number of registers
8991 didn't change, as reload makes multiple calls to this function and does not
8992 expect the decision to change within a single iteration. */
8993 else if (!optimize_function_for_size_p (cfun)
8994 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8996 int count = frame->nregs;
8997 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8999 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9001 /* The fast prologue uses move instead of push to save registers. This
9002 is significantly longer, but also executes faster as modern hardware
9003 can execute the moves in parallel, but can't do that for push/pop.
9005 Be careful about choosing which prologue to emit: when the function takes
9006 many instructions to execute we may use the slow version, as well as when
9007 the function is known to be outside a hot spot (this is known with
9008 feedback only). Weight the size of the function by the number of registers
9009 to save, as it is cheap to use one or two push instructions but very
9010 slow to use many of them. */
9011 if (count)
9012 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9013 if (node->frequency < NODE_FREQUENCY_NORMAL
9014 || (flag_branch_probabilities
9015 && node->frequency < NODE_FREQUENCY_HOT))
9016 cfun->machine->use_fast_prologue_epilogue = false;
9017 else
9018 cfun->machine->use_fast_prologue_epilogue
9019 = !expensive_function_p (count);
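/* Worked example of the weighting above (illustrative; assuming
   FAST_PROLOGUE_INSN_COUNT is 30): saving 3 registers gives
   count = (3 - 1) * 30 = 60, so the mov-based prologue is used only if
   expensive_function_p (60) is false, i.e. roughly when the function is
   not expected to execute more than about 60 instructions on hot paths.
   Saving more registers raises that threshold, since a long run of
   pushes is comparatively slower.  */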
9022 frame->save_regs_using_mov
9023 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9024 /* If static stack checking is enabled and done with probes,
9025 the registers need to be saved before allocating the frame. */
9026 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9028 /* Skip return address. */
9029 offset = UNITS_PER_WORD;
9031 /* Skip pushed static chain. */
9032 if (ix86_static_chain_on_stack)
9033 offset += UNITS_PER_WORD;
9035 /* Skip saved base pointer. */
9036 if (frame_pointer_needed)
9037 offset += UNITS_PER_WORD;
9038 frame->hfp_save_offset = offset;
9040 /* The traditional frame pointer location is at the top of the frame. */
9041 frame->hard_frame_pointer_offset = offset;
9043 /* Register save area */
9044 offset += frame->nregs * UNITS_PER_WORD;
9045 frame->reg_save_offset = offset;
9047 /* On SEH target, registers are pushed just before the frame pointer
9048 location. */
9049 if (TARGET_SEH)
9050 frame->hard_frame_pointer_offset = offset;
9052 /* Align and set SSE register save area. */
9053 if (frame->nsseregs)
9055 /* The only ABI that has saved SSE registers (Win64) also has a
9056 16-byte aligned default stack, and thus we don't need to be
9057 within the re-aligned local stack frame to save them. */
9058 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9059 offset = (offset + 16 - 1) & -16;
9060 offset += frame->nsseregs * 16;
9062 frame->sse_reg_save_offset = offset;
9064 /* The re-aligned stack starts here. Values before this point are not
9065 directly comparable with values below this point. In order to make
9066 sure that no value happens to be the same before and after, force
9067 the alignment computation below to add a non-zero value. */
9068 if (stack_realign_fp)
9069 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
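/* Illustration of the rounding above: the usual idiom
   (offset + align - 1) & -align leaves an already-aligned offset
   unchanged (e.g. 64 stays 64 for align == 32), whereas adding the full
   alignment, (offset + align) & -align, maps 64 to 96.  This guarantees
   that offsets below the realignment point never coincide with offsets
   above it.  */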
9071 /* Va-arg area */
9072 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9073 offset += frame->va_arg_size;
9075 /* Align start of frame for local function. */
9076 if (stack_realign_fp
9077 || offset != frame->sse_reg_save_offset
9078 || size != 0
9079 || !crtl->is_leaf
9080 || cfun->calls_alloca
9081 || ix86_current_function_calls_tls_descriptor)
9082 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9084 /* Frame pointer points here. */
9085 frame->frame_pointer_offset = offset;
9087 offset += size;
9089 /* Add outgoing arguments area. Can be skipped if we eliminated
9090 all the function calls as dead code.
9091 Skipping is, however, impossible when the function calls alloca, as the
9092 alloca expander assumes that the last crtl->outgoing_args_size bytes
9093 of the stack frame are unused. */
9094 if (ACCUMULATE_OUTGOING_ARGS
9095 && (!crtl->is_leaf || cfun->calls_alloca
9096 || ix86_current_function_calls_tls_descriptor))
9098 offset += crtl->outgoing_args_size;
9099 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9101 else
9102 frame->outgoing_arguments_size = 0;
9104 /* Align stack boundary. Only needed if we're calling another function
9105 or using alloca. */
9106 if (!crtl->is_leaf || cfun->calls_alloca
9107 || ix86_current_function_calls_tls_descriptor)
9108 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9110 /* We've reached end of stack frame. */
9111 frame->stack_pointer_offset = offset;
9113 /* Size prologue needs to allocate. */
9114 to_allocate = offset - frame->sse_reg_save_offset;
9116 if ((!to_allocate && frame->nregs <= 1)
9117 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9118 frame->save_regs_using_mov = false;
9120 if (ix86_using_red_zone ()
9121 && crtl->sp_is_unchanging
9122 && crtl->is_leaf
9123 && !ix86_current_function_calls_tls_descriptor)
9125 frame->red_zone_size = to_allocate;
9126 if (frame->save_regs_using_mov)
9127 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9128 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9129 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9131 else
9132 frame->red_zone_size = 0;
9133 frame->stack_pointer_offset -= frame->red_zone_size;
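/* Example of the red-zone accounting above (assuming the i386.h values
   RED_ZONE_SIZE == 128 and RED_ZONE_RESERVE == 8): a 64-bit leaf with
   to_allocate == 40 and 2 registers saved via mov gets
   red_zone_size = 40 + 2 * 8 = 56, below the 128 - 8 = 120 cap, so the
   whole allocation lives in the red zone and stack_pointer_offset
   shrinks by 56 instead of requiring an explicit adjustment.  */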
9135 /* The SEH frame pointer location is near the bottom of the frame.
9136 This is enforced by the fact that the difference between the
9137 stack pointer and the frame pointer is limited to 240 bytes in
9138 the unwind data structure. */
9139 if (TARGET_SEH)
9141 HOST_WIDE_INT diff;
9143 /* If we can leave the frame pointer where it is, do so. This also returns
9144 the establisher frame for __builtin_frame_address (0). */
9145 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9146 if (diff <= SEH_MAX_FRAME_SIZE
9147 && (diff > 240 || (diff & 15) != 0)
9148 && !crtl->accesses_prior_frames)
9150 /* Ideally we'd determine what portion of the local stack frame
9151 (within the constraint of the lowest 240) is most heavily used.
9152 But without that complication, simply bias the frame pointer
9153 by 128 bytes so as to maximize the amount of the local stack
9154 frame that is addressable with 8-bit offsets. */
9155 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
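/* Numerically: with the frame pointer placed 128 bytes above the final
   stack pointer, signed 8-bit displacements [-128, +127] off the frame
   pointer address the range [SP, SP + 255], i.e. the bottom 256 bytes
   of the local frame get the short encoding.  */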
9160 /* This is semi-inlined memory_address_length, but simplified
9161 since we know that we're always dealing with reg+offset, and
9162 to avoid having to create and discard all that rtl. */
9164 static inline int
9165 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9167 int len = 4;
9169 if (offset == 0)
9171 /* EBP and R13 cannot be encoded without an offset. */
9172 len = (regno == BP_REG || regno == R13_REG);
9174 else if (IN_RANGE (offset, -128, 127))
9175 len = 1;
9177 /* ESP and R12 must be encoded with a SIB byte. */
9178 if (regno == SP_REG || regno == R12_REG)
9179 len++;
9181 return len;
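/* A few example results (illustrative): (AX_REG, 0) -> 0, since (%eax)
   needs no displacement; (BP_REG, 0) -> 1, because %ebp always needs at
   least a disp8; (SP_REG, 0) -> 1, for the mandatory SIB byte;
   (AX_REG, 64) -> 1 (disp8); (SP_REG, 200) -> 5 (disp32 plus SIB).  */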
9184 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9185 The valid base registers are taken from CFUN->MACHINE->FS. */
9187 static rtx
9188 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9190 const struct machine_function *m = cfun->machine;
9191 rtx base_reg = NULL;
9192 HOST_WIDE_INT base_offset = 0;
9194 if (m->use_fast_prologue_epilogue)
9196 /* Choose the base register most likely to allow the most scheduling
9197 opportunities. Generally FP is valid throughout the function,
9198 while DRAP must be reloaded within the epilogue. But choose either
9199 over the SP due to increased encoding size. */
9201 if (m->fs.fp_valid)
9203 base_reg = hard_frame_pointer_rtx;
9204 base_offset = m->fs.fp_offset - cfa_offset;
9206 else if (m->fs.drap_valid)
9208 base_reg = crtl->drap_reg;
9209 base_offset = 0 - cfa_offset;
9211 else if (m->fs.sp_valid)
9213 base_reg = stack_pointer_rtx;
9214 base_offset = m->fs.sp_offset - cfa_offset;
9217 else
9219 HOST_WIDE_INT toffset;
9220 int len = 16, tlen;
9222 /* Choose the base register with the smallest address encoding.
9223 With a tie, choose FP > DRAP > SP. */
9224 if (m->fs.sp_valid)
9226 base_reg = stack_pointer_rtx;
9227 base_offset = m->fs.sp_offset - cfa_offset;
9228 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9230 if (m->fs.drap_valid)
9232 toffset = 0 - cfa_offset;
9233 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9234 if (tlen <= len)
9236 base_reg = crtl->drap_reg;
9237 base_offset = toffset;
9238 len = tlen;
9241 if (m->fs.fp_valid)
9243 toffset = m->fs.fp_offset - cfa_offset;
9244 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9245 if (tlen <= len)
9247 base_reg = hard_frame_pointer_rtx;
9248 base_offset = toffset;
9249 len = tlen;
9253 gcc_assert (base_reg != NULL);
9255 return plus_constant (Pmode, base_reg, base_offset);
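/* For instance (illustrative values), with cfa_offset == 80,
   fs.fp_offset == 16 and fs.sp_offset == 96 in the size-optimizing path:
   the SP-based address has offset 96 - 80 = 16 but costs 2 bytes
   (disp8 plus SIB), while the FP-based address has offset 16 - 80 = -64
   and costs only 1 byte, so the frame pointer is chosen.  */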
9258 /* Emit code to save registers in the prologue. */
9260 static void
9261 ix86_emit_save_regs (void)
9263 unsigned int regno;
9264 rtx insn;
9266 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9267 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9269 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9270 RTX_FRAME_RELATED_P (insn) = 1;
9274 /* Emit a single register save at CFA - CFA_OFFSET. */
9276 static void
9277 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9278 HOST_WIDE_INT cfa_offset)
9280 struct machine_function *m = cfun->machine;
9281 rtx reg = gen_rtx_REG (mode, regno);
9282 rtx mem, addr, base, insn;
9284 addr = choose_baseaddr (cfa_offset);
9285 mem = gen_frame_mem (mode, addr);
9287 /* For SSE saves, we need to indicate the 128-bit alignment. */
9288 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9290 insn = emit_move_insn (mem, reg);
9291 RTX_FRAME_RELATED_P (insn) = 1;
9293 base = addr;
9294 if (GET_CODE (base) == PLUS)
9295 base = XEXP (base, 0);
9296 gcc_checking_assert (REG_P (base));
9298 /* When saving registers into a re-aligned local stack frame, avoid
9299 any tricky guessing by dwarf2out. */
9300 if (m->fs.realigned)
9302 gcc_checking_assert (stack_realign_drap);
9304 if (regno == REGNO (crtl->drap_reg))
9306 /* A bit of a hack. We force the DRAP register to be saved in
9307 the re-aligned stack frame, which provides us with a copy
9308 of the CFA that will last past the prologue. Install it. */
9309 gcc_checking_assert (cfun->machine->fs.fp_valid);
9310 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9311 cfun->machine->fs.fp_offset - cfa_offset);
9312 mem = gen_rtx_MEM (mode, addr);
9313 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9315 else
9317 /* The frame pointer is a stable reference within the
9318 aligned frame. Use it. */
9319 gcc_checking_assert (cfun->machine->fs.fp_valid);
9320 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9321 cfun->machine->fs.fp_offset - cfa_offset);
9322 mem = gen_rtx_MEM (mode, addr);
9323 add_reg_note (insn, REG_CFA_EXPRESSION,
9324 gen_rtx_SET (VOIDmode, mem, reg));
9328 /* The memory may not be relative to the current CFA register,
9329 which means that we may need to generate a new pattern for
9330 use by the unwind info. */
9331 else if (base != m->fs.cfa_reg)
9333 addr = plus_constant (Pmode, m->fs.cfa_reg,
9334 m->fs.cfa_offset - cfa_offset);
9335 mem = gen_rtx_MEM (mode, addr);
9336 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9340 /* Emit code to save registers using MOV insns.
9341 First register is stored at CFA - CFA_OFFSET. */
9342 static void
9343 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9345 unsigned int regno;
9347 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9348 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9350 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9351 cfa_offset -= UNITS_PER_WORD;
9355 /* Emit code to save SSE registers using MOV insns.
9356 First register is stored at CFA - CFA_OFFSET. */
9357 static void
9358 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9360 unsigned int regno;
9362 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9363 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9365 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9366 cfa_offset -= 16;
9370 static GTY(()) rtx queued_cfa_restores;
9372 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9373 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9374 Don't add the note if the previously saved value will be left untouched
9375 within the stack red zone until return, as unwinders can find the same value
9376 in the register and on the stack. */
9378 static void
9379 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9381 if (!crtl->shrink_wrapped
9382 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9383 return;
9385 if (insn)
9387 add_reg_note (insn, REG_CFA_RESTORE, reg);
9388 RTX_FRAME_RELATED_P (insn) = 1;
9390 else
9391 queued_cfa_restores
9392 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9395 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9397 static void
9398 ix86_add_queued_cfa_restore_notes (rtx insn)
9400 rtx last;
9401 if (!queued_cfa_restores)
9402 return;
9403 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9405 XEXP (last, 1) = REG_NOTES (insn);
9406 REG_NOTES (insn) = queued_cfa_restores;
9407 queued_cfa_restores = NULL_RTX;
9408 RTX_FRAME_RELATED_P (insn) = 1;
9411 /* Expand prologue or epilogue stack adjustment.
9412 The pattern exists to put a dependency on all ebp-based memory accesses.
9413 STYLE should be negative if instructions should be marked as frame related,
9414 zero if the %r11 register is live and cannot be freely used, and positive
9415 otherwise. */
9417 static void
9418 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9419 int style, bool set_cfa)
9421 struct machine_function *m = cfun->machine;
9422 rtx insn;
9423 bool add_frame_related_expr = false;
9425 if (Pmode == SImode)
9426 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9427 else if (x86_64_immediate_operand (offset, DImode))
9428 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9429 else
9431 rtx tmp;
9432 /* r11 is used by indirect sibcall return as well, set before the
9433 epilogue and used after the epilogue. */
9434 if (style)
9435 tmp = gen_rtx_REG (DImode, R11_REG);
9436 else
9438 gcc_assert (src != hard_frame_pointer_rtx
9439 && dest != hard_frame_pointer_rtx);
9440 tmp = hard_frame_pointer_rtx;
9442 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9443 if (style < 0)
9444 add_frame_related_expr = true;
9446 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9449 insn = emit_insn (insn);
9450 if (style >= 0)
9451 ix86_add_queued_cfa_restore_notes (insn);
9453 if (set_cfa)
9455 rtx r;
9457 gcc_assert (m->fs.cfa_reg == src);
9458 m->fs.cfa_offset += INTVAL (offset);
9459 m->fs.cfa_reg = dest;
9461 r = gen_rtx_PLUS (Pmode, src, offset);
9462 r = gen_rtx_SET (VOIDmode, dest, r);
9463 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9464 RTX_FRAME_RELATED_P (insn) = 1;
9466 else if (style < 0)
9468 RTX_FRAME_RELATED_P (insn) = 1;
9469 if (add_frame_related_expr)
9471 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9472 r = gen_rtx_SET (VOIDmode, dest, r);
9473 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9477 if (dest == stack_pointer_rtx)
9479 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9480 bool valid = m->fs.sp_valid;
9482 if (src == hard_frame_pointer_rtx)
9484 valid = m->fs.fp_valid;
9485 ooffset = m->fs.fp_offset;
9487 else if (src == crtl->drap_reg)
9489 valid = m->fs.drap_valid;
9490 ooffset = 0;
9492 else
9494 /* Else there are two possibilities: SP itself, which we set
9495 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9496 taken care of by hand along the eh_return path. */
9497 gcc_checking_assert (src == stack_pointer_rtx
9498 || offset == const0_rtx);
9501 m->fs.sp_offset = ooffset - INTVAL (offset);
9502 m->fs.sp_valid = valid;
9506 /* Find an available register to be used as dynamic realign argument
9507 pointer register. Such a register will be written in the prologue and
9508 used at the beginning of the body, so it must not be
9509 1. parameter passing register.
9510 2. GOT pointer.
9511 We reuse static-chain register if it is available. Otherwise, we
9512 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9513 shorter encoding.
9515 Return: the regno of the chosen register. */
9517 static unsigned int
9518 find_drap_reg (void)
9520 tree decl = cfun->decl;
9522 if (TARGET_64BIT)
9524 /* Use R13 for a nested function or a function that needs a static chain.
9525 Since a function with a tail call may use any caller-saved
9526 register in the epilogue, DRAP must not use a caller-saved
9527 register in that case. */
9528 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9529 return R13_REG;
9531 return R10_REG;
9533 else
9535 /* Use DI for a nested function or a function that needs a static chain.
9536 Since a function with a tail call may use any caller-saved
9537 register in the epilogue, DRAP must not use a caller-saved
9538 register in that case. */
9539 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9540 return DI_REG;
9542 /* Reuse static chain register if it isn't used for parameter
9543 passing. */
9544 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9546 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9547 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9548 return CX_REG;
9550 return DI_REG;
9554 /* Return minimum incoming stack alignment. */
9556 static unsigned int
9557 ix86_minimum_incoming_stack_boundary (bool sibcall)
9559 unsigned int incoming_stack_boundary;
9561 /* Prefer the one specified at command line. */
9562 if (ix86_user_incoming_stack_boundary)
9563 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9564 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9565 when -mstackrealign is used, this isn't a sibcall check, and the
9566 estimated stack alignment is 128 bits. */
9567 else if (!sibcall
9568 && !TARGET_64BIT
9569 && ix86_force_align_arg_pointer
9570 && crtl->stack_alignment_estimated == 128)
9571 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9572 else
9573 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9575 /* Incoming stack alignment can be changed on individual functions
9576 via force_align_arg_pointer attribute. We use the smallest
9577 incoming stack boundary. */
9578 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9579 && lookup_attribute (ix86_force_align_arg_pointer_string,
9580 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9581 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9583 /* The incoming stack frame has to be aligned at least at
9584 parm_stack_boundary. */
9585 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9586 incoming_stack_boundary = crtl->parm_stack_boundary;
9588 /* The stack at the entry of main is aligned by the runtime. We use the
9589 smallest incoming stack boundary. */
9590 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9591 && DECL_NAME (current_function_decl)
9592 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9593 && DECL_FILE_SCOPE_P (current_function_decl))
9594 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9596 return incoming_stack_boundary;
9599 /* Update incoming stack boundary and estimated stack alignment. */
9601 static void
9602 ix86_update_stack_boundary (void)
9604 ix86_incoming_stack_boundary
9605 = ix86_minimum_incoming_stack_boundary (false);
9607 /* An x86_64 varargs function needs 16-byte stack alignment for the
9608 register save area. */
9609 if (TARGET_64BIT
9610 && cfun->stdarg
9611 && crtl->stack_alignment_estimated < 128)
9612 crtl->stack_alignment_estimated = 128;
9615 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9616 needed or an rtx for DRAP otherwise. */
9618 static rtx
9619 ix86_get_drap_rtx (void)
9621 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9622 crtl->need_drap = true;
9624 if (stack_realign_drap)
9626 /* Assign DRAP to vDRAP and return vDRAP. */
9627 unsigned int regno = find_drap_reg ();
9628 rtx drap_vreg;
9629 rtx arg_ptr;
9630 rtx seq, insn;
9632 arg_ptr = gen_rtx_REG (Pmode, regno);
9633 crtl->drap_reg = arg_ptr;
9635 start_sequence ();
9636 drap_vreg = copy_to_reg (arg_ptr);
9637 seq = get_insns ();
9638 end_sequence ();
9640 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9641 if (!optimize)
9643 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9644 RTX_FRAME_RELATED_P (insn) = 1;
9646 return drap_vreg;
9648 else
9649 return NULL;
9652 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9654 static rtx
9655 ix86_internal_arg_pointer (void)
9657 return virtual_incoming_args_rtx;
9660 struct scratch_reg {
9661 rtx reg;
9662 bool saved;
9665 /* Return a short-lived scratch register for use on function entry.
9666 In 32-bit mode, it is valid only after the registers are saved
9667 in the prologue. This register must be released by means of
9668 release_scratch_register_on_entry once it is dead. */
9670 static void
9671 get_scratch_register_on_entry (struct scratch_reg *sr)
9673 int regno;
9675 sr->saved = false;
9677 if (TARGET_64BIT)
9679 /* We always use R11 in 64-bit mode. */
9680 regno = R11_REG;
9682 else
9684 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9685 bool fastcall_p
9686 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9687 bool thiscall_p
9688 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9689 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9690 int regparm = ix86_function_regparm (fntype, decl);
9691 int drap_regno
9692 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9694 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9695 for the static chain register. */
9696 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9697 && drap_regno != AX_REG)
9698 regno = AX_REG;
9699 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9700 for the static chain register. */
9701 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9702 regno = AX_REG;
9703 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9704 regno = DX_REG;
9705 /* ecx is the static chain register. */
9706 else if (regparm < 3 && !fastcall_p && !thiscall_p
9707 && !static_chain_p
9708 && drap_regno != CX_REG)
9709 regno = CX_REG;
9710 else if (ix86_save_reg (BX_REG, true))
9711 regno = BX_REG;
9712 /* esi is the static chain register. */
9713 else if (!(regparm == 3 && static_chain_p)
9714 && ix86_save_reg (SI_REG, true))
9715 regno = SI_REG;
9716 else if (ix86_save_reg (DI_REG, true))
9717 regno = DI_REG;
9718 else
9720 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9721 sr->saved = true;
9725 sr->reg = gen_rtx_REG (Pmode, regno);
9726 if (sr->saved)
9728 rtx insn = emit_insn (gen_push (sr->reg));
9729 RTX_FRAME_RELATED_P (insn) = 1;
9733 /* Release a scratch register obtained from the preceding function. */
9735 static void
9736 release_scratch_register_on_entry (struct scratch_reg *sr)
9738 if (sr->saved)
9740 struct machine_function *m = cfun->machine;
9741 rtx x, insn = emit_insn (gen_pop (sr->reg));
9743 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9744 RTX_FRAME_RELATED_P (insn) = 1;
9745 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9746 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9747 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9748 m->fs.sp_offset -= UNITS_PER_WORD;
9752 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
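/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 (see defaults.h)
   this evaluates to a 4096-byte interval, i.e. roughly one probe per
   page on typical x86 configurations.  */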
9754 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9756 static void
9757 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9759 /* We skip the probe for the first interval + a small dope of 4 words and
9760 probe that many bytes past the specified size to maintain a protection
9761 area at the bottom of the stack. */
9762 const int dope = 4 * UNITS_PER_WORD;
9763 rtx size_rtx = GEN_INT (size), last;
9765 /* See if we have a constant small number of probes to generate. If so,
9766 that's the easy case. The run-time loop is made up of 11 insns in the
9767 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9768 for n # of intervals. */
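/* For example, at n == 5 intervals the unrolled sequence costs
   3 + 2 * (5 - 1) = 11 insns, the same as the run-time loop, which is
   why the cutoff below is 5 * PROBE_INTERVAL: up to that size the
   unrolled form is no larger and needs no scratch register.  */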
9769 if (size <= 5 * PROBE_INTERVAL)
9771 HOST_WIDE_INT i, adjust;
9772 bool first_probe = true;
9774 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9775 values of N from 1 until it exceeds SIZE. If only one probe is
9776 needed, this will not generate any code. Then adjust and probe
9777 to PROBE_INTERVAL + SIZE. */
9778 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9780 if (first_probe)
9782 adjust = 2 * PROBE_INTERVAL + dope;
9783 first_probe = false;
9785 else
9786 adjust = PROBE_INTERVAL;
9788 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9789 plus_constant (Pmode, stack_pointer_rtx,
9790 -adjust)));
9791 emit_stack_probe (stack_pointer_rtx);
9794 if (first_probe)
9795 adjust = size + PROBE_INTERVAL + dope;
9796 else
9797 adjust = size + PROBE_INTERVAL - i;
9799 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9800 plus_constant (Pmode, stack_pointer_rtx,
9801 -adjust)));
9802 emit_stack_probe (stack_pointer_rtx);
9804 /* Adjust back to account for the additional first interval. */
9805 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9806 plus_constant (Pmode, stack_pointer_rtx,
9807 PROBE_INTERVAL + dope)));
9810 /* Otherwise, do the same as above, but in a loop. Note that we must be
9811 extra careful with variables wrapping around because we might be at
9812 the very top (or the very bottom) of the address space and we have
9813 to be able to handle this case properly; in particular, we use an
9814 equality test for the loop condition. */
9815 else
9817 HOST_WIDE_INT rounded_size;
9818 struct scratch_reg sr;
9820 get_scratch_register_on_entry (&sr);
9823 /* Step 1: round SIZE to the previous multiple of the interval. */
9825 rounded_size = size & -PROBE_INTERVAL;
9828 /* Step 2: compute initial and final value of the loop counter. */
9830 /* SP = SP_0 + PROBE_INTERVAL. */
9831 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9832 plus_constant (Pmode, stack_pointer_rtx,
9833 - (PROBE_INTERVAL + dope))));
9835 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9836 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9837 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9838 gen_rtx_PLUS (Pmode, sr.reg,
9839 stack_pointer_rtx)));
9842 /* Step 3: the loop
9844 while (SP != LAST_ADDR)
9846 SP = SP + PROBE_INTERVAL
9847 probe at SP
9850 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9851 values of N from 1 until it is equal to ROUNDED_SIZE. */
9853 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9856 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9857 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9859 if (size != rounded_size)
9861 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9862 plus_constant (Pmode, stack_pointer_rtx,
9863 rounded_size - size)));
9864 emit_stack_probe (stack_pointer_rtx);
9867 /* Adjust back to account for the additional first interval. */
9868 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9869 plus_constant (Pmode, stack_pointer_rtx,
9870 PROBE_INTERVAL + dope)));
9872 release_scratch_register_on_entry (&sr);
9875 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9877 /* Even if the stack pointer isn't the CFA register, we need to correctly
9878 describe the adjustments made to it, in particular differentiate the
9879 frame-related ones from the frame-unrelated ones. */
9880 if (size > 0)
9882 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9883 XVECEXP (expr, 0, 0)
9884 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9885 plus_constant (Pmode, stack_pointer_rtx, -size));
9886 XVECEXP (expr, 0, 1)
9887 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9888 plus_constant (Pmode, stack_pointer_rtx,
9889 PROBE_INTERVAL + dope + size));
9890 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9891 RTX_FRAME_RELATED_P (last) = 1;
9893 cfun->machine->fs.sp_offset += size;
9896 /* Make sure nothing is scheduled before we are done. */
9897 emit_insn (gen_blockage ());
9900 /* Adjust the stack pointer up to REG while probing it. */
9902 const char *
9903 output_adjust_stack_and_probe (rtx reg)
9905 static int labelno = 0;
9906 char loop_lab[32], end_lab[32];
9907 rtx xops[2];
9909 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9910 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9912 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9914 /* Jump to END_LAB if SP == LAST_ADDR. */
9915 xops[0] = stack_pointer_rtx;
9916 xops[1] = reg;
9917 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9918 fputs ("\tje\t", asm_out_file);
9919 assemble_name_raw (asm_out_file, end_lab);
9920 fputc ('\n', asm_out_file);
9922 /* SP = SP + PROBE_INTERVAL. */
9923 xops[1] = GEN_INT (PROBE_INTERVAL);
9924 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9926 /* Probe at SP. */
9927 xops[1] = const0_rtx;
9928 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9930 fprintf (asm_out_file, "\tjmp\t");
9931 assemble_name_raw (asm_out_file, loop_lab);
9932 fputc ('\n', asm_out_file);
9934 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9936 return "";
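/* The emitted loop looks roughly like this (32-bit AT&T syntax, assuming
   a 4096-byte PROBE_INTERVAL and label number 0):

       .LPSRL0: cmpl    %reg, %esp
                je      .LPSRE0
                subl    $4096, %esp
                orl     $0, (%esp)
                jmp     .LPSRL0
       .LPSRE0:                                                      */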
9939 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9940 inclusive. These are offsets from the current stack pointer. */
9942 static void
9943 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9945 /* See if we have a constant small number of probes to generate. If so,
9946 that's the easy case. The run-time loop is made up of 7 insns in the
9947 generic case while the compile-time loop is made up of n insns for n #
9948 of intervals. */
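/* For example, a 7-interval size costs 7 probe insns when unrolled,
   matching the 7-insn run-time loop, hence the 7 * PROBE_INTERVAL
   cutoff below; anything larger is cheaper as a loop.  */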
9949 if (size <= 7 * PROBE_INTERVAL)
9951 HOST_WIDE_INT i;
9953 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9954 it exceeds SIZE. If only one probe is needed, this will not
9955 generate any code. Then probe at FIRST + SIZE. */
9956 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9957 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9958 -(first + i)));
9960 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9961 -(first + size)));
9964 /* Otherwise, do the same as above, but in a loop. Note that we must be
9965 extra careful with variables wrapping around because we might be at
9966 the very top (or the very bottom) of the address space and we have
9967 to be able to handle this case properly; in particular, we use an
9968 equality test for the loop condition. */
9969 else
9971 HOST_WIDE_INT rounded_size, last;
9972 struct scratch_reg sr;
9974 get_scratch_register_on_entry (&sr);
9977 /* Step 1: round SIZE to the previous multiple of the interval. */
9979 rounded_size = size & -PROBE_INTERVAL;
9982 /* Step 2: compute initial and final value of the loop counter. */
9984 /* TEST_OFFSET = FIRST. */
9985 emit_move_insn (sr.reg, GEN_INT (-first));
9987 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9988 last = first + rounded_size;
9991 /* Step 3: the loop
9993 while (TEST_ADDR != LAST_ADDR)
9995 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9996 probe at TEST_ADDR
9999 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10000 until it is equal to ROUNDED_SIZE. */
10002 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10005 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10006 that SIZE is equal to ROUNDED_SIZE. */
10008 if (size != rounded_size)
10009 emit_stack_probe (plus_constant (Pmode,
10010 gen_rtx_PLUS (Pmode,
10011 stack_pointer_rtx,
10012 sr.reg),
10013 rounded_size - size));
10015 release_scratch_register_on_entry (&sr);
10018 /* Make sure nothing is scheduled before we are done. */
10019 emit_insn (gen_blockage ());
10022 /* Probe a range of stack addresses from REG to END, inclusive. These are
10023 offsets from the current stack pointer. */
10025 const char *
10026 output_probe_stack_range (rtx reg, rtx end)
10028 static int labelno = 0;
10029 char loop_lab[32], end_lab[32];
10030 rtx xops[3];
10032 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10033 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10035 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10037 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10038 xops[0] = reg;
10039 xops[1] = end;
10040 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10041 fputs ("\tje\t", asm_out_file);
10042 assemble_name_raw (asm_out_file, end_lab);
10043 fputc ('\n', asm_out_file);
10045 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10046 xops[1] = GEN_INT (PROBE_INTERVAL);
10047 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10049 /* Probe at TEST_ADDR. */
10050 xops[0] = stack_pointer_rtx;
10051 xops[1] = reg;
10052 xops[2] = const0_rtx;
10053 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10055 fprintf (asm_out_file, "\tjmp\t");
10056 assemble_name_raw (asm_out_file, loop_lab);
10057 fputc ('\n', asm_out_file);
10059 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10061 return "";
10064 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10065 to be generated in correct form. */
10066 static void
10067 ix86_finalize_stack_realign_flags (void)
10069 /* Check whether stack realignment is really needed after reload, and
10070 store the result in cfun. */
10071 unsigned int incoming_stack_boundary
10072 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10073 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10074 unsigned int stack_realign = (incoming_stack_boundary
10075 < (crtl->is_leaf
10076 ? crtl->max_used_stack_slot_alignment
10077 : crtl->stack_alignment_needed));
10079 if (crtl->stack_realign_finalized)
10081 /* After stack_realign_needed is finalized, we can no longer
10082 change it. */
10083 gcc_assert (crtl->stack_realign_needed == stack_realign);
10084 return;
10087 /* If the only reason for frame_pointer_needed is that we conservatively
10088 assumed stack realignment might be needed, but in the end nothing that
10089 needed the stack alignment had been spilled, clear frame_pointer_needed
10090 and say we don't need stack realignment. */
10091 if (stack_realign
10092 && !crtl->need_drap
10093 && frame_pointer_needed
10094 && crtl->is_leaf
10095 && flag_omit_frame_pointer
10096 && crtl->sp_is_unchanging
10097 && !ix86_current_function_calls_tls_descriptor
10098 && !crtl->accesses_prior_frames
10099 && !cfun->calls_alloca
10100 && !crtl->calls_eh_return
10101 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10102 && !ix86_frame_pointer_required ()
10103 && get_frame_size () == 0
10104 && ix86_nsaved_sseregs () == 0
10105 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10107 HARD_REG_SET set_up_by_prologue, prologue_used;
10108 basic_block bb;
10110 CLEAR_HARD_REG_SET (prologue_used);
10111 CLEAR_HARD_REG_SET (set_up_by_prologue);
10112 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10113 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10114 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10115 HARD_FRAME_POINTER_REGNUM);
10116 FOR_EACH_BB (bb)
10118 rtx insn;
10119 FOR_BB_INSNS (bb, insn)
10120 if (NONDEBUG_INSN_P (insn)
10121 && requires_stack_frame_p (insn, prologue_used,
10122 set_up_by_prologue))
10124 crtl->stack_realign_needed = stack_realign;
10125 crtl->stack_realign_finalized = true;
10126 return;
10130 frame_pointer_needed = false;
10131 stack_realign = false;
10132 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10133 crtl->stack_alignment_needed = incoming_stack_boundary;
10134 crtl->stack_alignment_estimated = incoming_stack_boundary;
10135 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10136 crtl->preferred_stack_boundary = incoming_stack_boundary;
10137 df_finish_pass (true);
10138 df_scan_alloc (NULL);
10139 df_scan_blocks ();
10140 df_compute_regs_ever_live (true);
10141 df_analyze ();
10144 crtl->stack_realign_needed = stack_realign;
10145 crtl->stack_realign_finalized = true;
10148 /* Expand the prologue into a bunch of separate insns. */
10150 void
10151 ix86_expand_prologue (void)
10153 struct machine_function *m = cfun->machine;
10154 rtx insn, t;
10155 bool pic_reg_used;
10156 struct ix86_frame frame;
10157 HOST_WIDE_INT allocate;
10158 bool int_registers_saved;
10159 bool sse_registers_saved;
10161 ix86_finalize_stack_realign_flags ();
10163 /* DRAP should not coexist with stack_realign_fp */
10164 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10166 memset (&m->fs, 0, sizeof (m->fs));
10168 /* Initialize CFA state for before the prologue. */
10169 m->fs.cfa_reg = stack_pointer_rtx;
10170 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10172 /* Track SP offset to the CFA. We continue tracking this after we've
10173 swapped the CFA register away from SP. In the case of re-alignment
10174 this is fudged; we're interested in offsets within the local frame. */
10175 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10176 m->fs.sp_valid = true;
10178 ix86_compute_frame_layout (&frame);
10180 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10182 /* We should have already generated an error for any use of
10183 ms_hook on a nested function. */
10184 gcc_checking_assert (!ix86_static_chain_on_stack);
10186 /* Check if profiling is active and we shall use the profiling-before-
10187 prologue variant. If so, issue a sorry. */
10188 if (crtl->profile && flag_fentry != 0)
10189 sorry ("ms_hook_prologue attribute isn%'t compatible "
10190 "with -mfentry for 32-bit");
10192 /* In ix86_asm_output_function_label we emitted:
10193 8b ff movl.s %edi,%edi
10194 55 push %ebp
10195 8b ec movl.s %esp,%ebp
10197 This matches the hookable function prologue in Win32 API
10198 functions in Microsoft Windows XP Service Pack 2 and newer.
10199 Wine uses this to enable Windows apps to hook the Win32 API
10200 functions provided by Wine.
10202 What that means is that we've already set up the frame pointer. */
10204 if (frame_pointer_needed
10205 && !(crtl->drap_reg && crtl->stack_realign_needed))
10207 rtx push, mov;
10209 /* We've decided to use the frame pointer already set up.
10210 Describe this to the unwinder by pretending that both
10211 push and mov insns happen right here.
10213 Putting the unwind info here at the end of the ms_hook
10214 is done so that we can make absolutely certain we get
10215 the required byte sequence at the start of the function,
10216 rather than relying on an assembler that can produce
10217 the exact encoding required.
10219 However it does mean (in the unpatched case) that we have
10220 a 1 insn window where the asynchronous unwind info is
10221 incorrect. However, if we placed the unwind info at
10222 its correct location we would have incorrect unwind info
10223 in the patched case. Which is probably all moot since
10224 I don't expect Wine generates dwarf2 unwind info for the
10225 system libraries that use this feature. */
10227 insn = emit_insn (gen_blockage ());
10229 push = gen_push (hard_frame_pointer_rtx);
10230 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10231 stack_pointer_rtx);
10232 RTX_FRAME_RELATED_P (push) = 1;
10233 RTX_FRAME_RELATED_P (mov) = 1;
10235 RTX_FRAME_RELATED_P (insn) = 1;
10236 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10237 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10239 /* Note that gen_push incremented m->fs.cfa_offset, even
10240 though we didn't emit the push insn here. */
10241 m->fs.cfa_reg = hard_frame_pointer_rtx;
10242 m->fs.fp_offset = m->fs.cfa_offset;
10243 m->fs.fp_valid = true;
10245 else
10247 /* The frame pointer is not needed so pop %ebp again.
10248 This leaves us with a pristine state. */
10249 emit_insn (gen_pop (hard_frame_pointer_rtx));
10253 /* The first insn of a function that accepts its static chain on the
10254 stack is to push the register that would be filled in by a direct
10255 call. This insn will be skipped by the trampoline. */
10256 else if (ix86_static_chain_on_stack)
10258 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10259 emit_insn (gen_blockage ());
10261 /* We don't want to interpret this push insn as a register save,
10262 only as a stack adjustment. The real copy of the register as
10263 a save will be done later, if needed. */
10264 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10265 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10266 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10267 RTX_FRAME_RELATED_P (insn) = 1;
10270 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10271 DRAP is needed and stack realignment is really needed after reload. */
10272 if (stack_realign_drap)
10274 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10276 /* Only need to push parameter pointer reg if it is caller saved. */
10277 if (!call_used_regs[REGNO (crtl->drap_reg)])
10279 /* Push arg pointer reg */
10280 insn = emit_insn (gen_push (crtl->drap_reg));
10281 RTX_FRAME_RELATED_P (insn) = 1;
10284 /* Grab the argument pointer. */
10285 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10286 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10287 RTX_FRAME_RELATED_P (insn) = 1;
10288 m->fs.cfa_reg = crtl->drap_reg;
10289 m->fs.cfa_offset = 0;
10291 /* Align the stack. */
10292 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10293 stack_pointer_rtx,
10294 GEN_INT (-align_bytes)));
10295 RTX_FRAME_RELATED_P (insn) = 1;
10297 /* Replicate the return address on the stack so that return
10298 address can be reached via (argp - 1) slot. This is needed
10299 to implement macro RETURN_ADDR_RTX and intrinsic function
10300 expand_builtin_return_addr etc. */
10301 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10302 t = gen_frame_mem (word_mode, t);
10303 insn = emit_insn (gen_push (t));
10304 RTX_FRAME_RELATED_P (insn) = 1;
10306 /* For the purposes of frame and register save area addressing,
10307 we've started over with a new frame. */
10308 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10309 m->fs.realigned = true;
10312 int_registers_saved = (frame.nregs == 0);
10313 sse_registers_saved = (frame.nsseregs == 0);
10315 if (frame_pointer_needed && !m->fs.fp_valid)
10317 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10318 slower on all targets. Also sdb doesn't like it. */
10319 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10320 RTX_FRAME_RELATED_P (insn) = 1;
10322 /* Push registers now, before setting the frame pointer
10323 on SEH target. */
10324 if (!int_registers_saved
10325 && TARGET_SEH
10326 && !frame.save_regs_using_mov)
10328 ix86_emit_save_regs ();
10329 int_registers_saved = true;
10330 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10333 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10335 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10336 RTX_FRAME_RELATED_P (insn) = 1;
10338 if (m->fs.cfa_reg == stack_pointer_rtx)
10339 m->fs.cfa_reg = hard_frame_pointer_rtx;
10340 m->fs.fp_offset = m->fs.sp_offset;
10341 m->fs.fp_valid = true;
10345 if (!int_registers_saved)
10347 /* If saving registers via PUSH, do so now. */
10348 if (!frame.save_regs_using_mov)
10350 ix86_emit_save_regs ();
10351 int_registers_saved = true;
10352 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10355 /* When using the red zone we may start register saving before allocating
10356 the stack frame, saving one cycle of the prologue. However, avoid
10357 doing this if we have to probe the stack; at least on x86_64 the
10358 stack probe can turn into a call that clobbers a red zone location. */
10359 else if (ix86_using_red_zone ()
10360 && (! TARGET_STACK_PROBE
10361 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10363 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10364 int_registers_saved = true;
10368 if (stack_realign_fp)
10370 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10371 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10373 /* The computation of the size of the re-aligned stack frame means
10374 that we must allocate the size of the register save area before
10375 performing the actual alignment. Otherwise we cannot guarantee
10376 that there's enough storage above the realignment point. */
10377 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10378 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10379 GEN_INT (m->fs.sp_offset
10380 - frame.sse_reg_save_offset),
10381 -1, false);
10383 /* Align the stack. */
10384 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10385 stack_pointer_rtx,
10386 GEN_INT (-align_bytes)));
10388 /* For the purposes of register save area addressing, the stack
10389 pointer is no longer valid. As for the value of sp_offset,
10390 see ix86_compute_frame_layout, which we need to match in order
10391 to pass verification of stack_pointer_offset at the end. */
10392 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10393 m->fs.sp_valid = false;
10396 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10398 if (flag_stack_usage_info)
10400 /* We start to count from ARG_POINTER. */
10401 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10403 /* If it was realigned, take into account the fake frame. */
10404 if (stack_realign_drap)
10406 if (ix86_static_chain_on_stack)
10407 stack_size += UNITS_PER_WORD;
10409 if (!call_used_regs[REGNO (crtl->drap_reg)])
10410 stack_size += UNITS_PER_WORD;
10412 /* This over-estimates by 1 minimal-stack-alignment-unit but
10413 mitigates that by counting in the new return address slot. */
10414 current_function_dynamic_stack_size
10415 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10418 current_function_static_stack_size = stack_size;
10421 /* On SEH target with very large frame size, allocate an area to save
10422 SSE registers (as the very large allocation won't be described). */
10423 if (TARGET_SEH
10424 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10425 && !sse_registers_saved)
10427 HOST_WIDE_INT sse_size =
10428 frame.sse_reg_save_offset - frame.reg_save_offset;
10430 gcc_assert (int_registers_saved);
10432 /* No need to do stack checking as the area will be immediately
10433 written. */
10434 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10435 GEN_INT (-sse_size), -1,
10436 m->fs.cfa_reg == stack_pointer_rtx);
10437 allocate -= sse_size;
10438 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10439 sse_registers_saved = true;
10442 /* The stack has already been decremented by the instruction calling us
10443 so probe if the size is non-negative to preserve the protection area. */
10444 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10446 /* We expect the registers to be saved when probes are used. */
10447 gcc_assert (int_registers_saved);
10449 if (STACK_CHECK_MOVING_SP)
10451 ix86_adjust_stack_and_probe (allocate);
10452 allocate = 0;
10454 else
10456 HOST_WIDE_INT size = allocate;
10458 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10459 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10461 if (TARGET_STACK_PROBE)
10462 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10463 else
10464 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10468 if (allocate == 0)
10470 else if (!ix86_target_stack_probe ()
10471 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10473 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10474 GEN_INT (-allocate), -1,
10475 m->fs.cfa_reg == stack_pointer_rtx);
10477 else
10479 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10480 rtx r10 = NULL;
10481 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10482 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10483 bool eax_live = false;
10484 bool r10_live = false;
10486 if (TARGET_64BIT)
10487 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10488 if (!TARGET_64BIT_MS_ABI)
10489 eax_live = ix86_eax_live_at_start_p ();
10491 /* Note that SEH directives need to continue tracking the stack
10492 pointer even after the frame pointer has been set up. */
10493 if (eax_live)
10495 insn = emit_insn (gen_push (eax));
10496 allocate -= UNITS_PER_WORD;
10497 if (sp_is_cfa_reg || TARGET_SEH)
10499 if (sp_is_cfa_reg)
10500 m->fs.cfa_offset += UNITS_PER_WORD;
10501 RTX_FRAME_RELATED_P (insn) = 1;
10505 if (r10_live)
10507 r10 = gen_rtx_REG (Pmode, R10_REG);
10508 insn = emit_insn (gen_push (r10));
10509 allocate -= UNITS_PER_WORD;
10510 if (sp_is_cfa_reg || TARGET_SEH)
10512 if (sp_is_cfa_reg)
10513 m->fs.cfa_offset += UNITS_PER_WORD;
10514 RTX_FRAME_RELATED_P (insn) = 1;
10518 emit_move_insn (eax, GEN_INT (allocate));
10519 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10521 /* Use the fact that AX still contains ALLOCATE. */
10522 adjust_stack_insn = (Pmode == DImode
10523 ? gen_pro_epilogue_adjust_stack_di_sub
10524 : gen_pro_epilogue_adjust_stack_si_sub);
10526 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10527 stack_pointer_rtx, eax));
10529 if (sp_is_cfa_reg || TARGET_SEH)
10531 if (sp_is_cfa_reg)
10532 m->fs.cfa_offset += allocate;
10533 RTX_FRAME_RELATED_P (insn) = 1;
10534 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10535 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10536 plus_constant (Pmode, stack_pointer_rtx,
10537 -allocate)));
10539 m->fs.sp_offset += allocate;
10541 if (r10_live && eax_live)
10543 t = choose_baseaddr (m->fs.sp_offset - allocate);
10544 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10545 gen_frame_mem (word_mode, t));
10546 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10547 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10548 gen_frame_mem (word_mode, t));
10550 else if (eax_live || r10_live)
10552 t = choose_baseaddr (m->fs.sp_offset - allocate);
10553 emit_move_insn (gen_rtx_REG (word_mode,
10554 (eax_live ? AX_REG : R10_REG)),
10555 gen_frame_mem (word_mode, t));
10558 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10560 /* If we haven't already set up the frame pointer, do so now. */
10561 if (frame_pointer_needed && !m->fs.fp_valid)
10563 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10564 GEN_INT (frame.stack_pointer_offset
10565 - frame.hard_frame_pointer_offset));
10566 insn = emit_insn (insn);
10567 RTX_FRAME_RELATED_P (insn) = 1;
10568 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10570 if (m->fs.cfa_reg == stack_pointer_rtx)
10571 m->fs.cfa_reg = hard_frame_pointer_rtx;
10572 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10573 m->fs.fp_valid = true;
10576 if (!int_registers_saved)
10577 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10578 if (!sse_registers_saved)
10579 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10581 pic_reg_used = false;
10582 if (pic_offset_table_rtx
10583 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10584 || crtl->profile))
10586 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10588 if (alt_pic_reg_used != INVALID_REGNUM)
10589 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10591 pic_reg_used = true;
10594 if (pic_reg_used)
10596 if (TARGET_64BIT)
10598 if (ix86_cmodel == CM_LARGE_PIC)
10600 rtx label, tmp_reg;
10602 gcc_assert (Pmode == DImode);
10603 label = gen_label_rtx ();
10604 emit_label (label);
10605 LABEL_PRESERVE_P (label) = 1;
10606 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10607 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10608 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10609 label));
10610 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10611 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10612 pic_offset_table_rtx, tmp_reg));
10614 else
10615 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10617 else
10619 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10620 RTX_FRAME_RELATED_P (insn) = 1;
10621 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10625 /* In the pic_reg_used case, make sure that the got load isn't deleted
10626 when mcount needs it. Blockage to avoid call movement across mcount
10627 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10628 note. */
10629 if (crtl->profile && !flag_fentry && pic_reg_used)
10630 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10632 if (crtl->drap_reg && !crtl->stack_realign_needed)
10634 /* vDRAP is set up, but after reload it turns out stack realignment
10635 isn't necessary; here we emit prologue code to set up DRAP
10636 without the stack realignment adjustment. */
10637 t = choose_baseaddr (0);
10638 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10641 /* Prevent instructions from being scheduled into register save push
10642 sequence when access to the redzone area is done through frame pointer.
10643 The offset between the frame pointer and the stack pointer is calculated
10644 relative to the value of the stack pointer at the end of the function
10645 prologue, and moving instructions that access redzone area via frame
10646 pointer inside push sequence violates this assumption. */
10647 if (frame_pointer_needed && frame.red_zone_size)
10648 emit_insn (gen_memory_blockage ());
10650 /* Emit cld instruction if stringops are used in the function. */
10651 if (TARGET_CLD && ix86_current_function_needs_cld)
10652 emit_insn (gen_cld ());
10654 /* SEH requires that the prologue end within 256 bytes of the start of
10655 the function. Prevent instruction schedules that would extend that.
10656 Further, prevent alloca modifications to the stack pointer from being
10657 combined with prologue modifications. */
10658 if (TARGET_SEH)
10659 emit_insn (gen_prologue_use (stack_pointer_rtx));
10662 /* Emit code to restore REG using a POP insn. */
10664 static void
10665 ix86_emit_restore_reg_using_pop (rtx reg)
10667 struct machine_function *m = cfun->machine;
10668 rtx insn = emit_insn (gen_pop (reg));
10670 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10671 m->fs.sp_offset -= UNITS_PER_WORD;
10673 if (m->fs.cfa_reg == crtl->drap_reg
10674 && REGNO (reg) == REGNO (crtl->drap_reg))
10676 /* Previously we'd represented the CFA as an expression
10677 like *(%ebp - 8). We've just popped that value from
10678 the stack, which means we need to reset the CFA to
10679 the drap register. This will remain until we restore
10680 the stack pointer. */
10681 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10682 RTX_FRAME_RELATED_P (insn) = 1;
10684 /* This means that the DRAP register is valid for addressing too. */
10685 m->fs.drap_valid = true;
10686 return;
10689 if (m->fs.cfa_reg == stack_pointer_rtx)
10691 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10692 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10693 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10694 RTX_FRAME_RELATED_P (insn) = 1;
10696 m->fs.cfa_offset -= UNITS_PER_WORD;
10699 /* When the frame pointer is the CFA, and we pop it, we are
10700 swapping back to the stack pointer as the CFA. This happens
10701 for stack frames that don't allocate other data, so we assume
10702 the stack pointer is now pointing at the return address, i.e.
10703 the function entry state, which makes the offset be 1 word. */
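/* E.g. on x86-64, once the frame pointer has been popped the return
   address sits on top of the stack, so the new CFA is %rsp + 8
   (one word); purely an illustrative restatement of the above.  */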
10704 if (reg == hard_frame_pointer_rtx)
10706 m->fs.fp_valid = false;
10707 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10709 m->fs.cfa_reg = stack_pointer_rtx;
10710 m->fs.cfa_offset -= UNITS_PER_WORD;
10712 add_reg_note (insn, REG_CFA_DEF_CFA,
10713 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10714 GEN_INT (m->fs.cfa_offset)));
10715 RTX_FRAME_RELATED_P (insn) = 1;
10720 /* Emit code to restore saved registers using POP insns. */
10722 static void
10723 ix86_emit_restore_regs_using_pop (void)
10725 unsigned int regno;
10727 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10728 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10729 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10732 /* Emit code and notes for the LEAVE instruction. */
10734 static void
10735 ix86_emit_leave (void)
10737 struct machine_function *m = cfun->machine;
10738 rtx insn = emit_insn (ix86_gen_leave ());
10740 ix86_add_queued_cfa_restore_notes (insn);
10742 gcc_assert (m->fs.fp_valid);
10743 m->fs.sp_valid = true;
10744 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10745 m->fs.fp_valid = false;
10747 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10749 m->fs.cfa_reg = stack_pointer_rtx;
10750 m->fs.cfa_offset = m->fs.sp_offset;
10752 add_reg_note (insn, REG_CFA_DEF_CFA,
10753 plus_constant (Pmode, stack_pointer_rtx,
10754 m->fs.sp_offset));
10755 RTX_FRAME_RELATED_P (insn) = 1;
10757 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10758 m->fs.fp_offset);
10761 /* Emit code to restore saved registers using MOV insns.
10762 First register is restored from CFA - CFA_OFFSET. */
10763 static void
10764 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10765 bool maybe_eh_return)
10767 struct machine_function *m = cfun->machine;
10768 unsigned int regno;
10770 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10771 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10773 rtx reg = gen_rtx_REG (word_mode, regno);
10774 rtx insn, mem;
10776 mem = choose_baseaddr (cfa_offset);
10777 mem = gen_frame_mem (word_mode, mem);
10778 insn = emit_move_insn (reg, mem);
10780 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10782 /* Previously we'd represented the CFA as an expression
10783 like *(%ebp - 8). We've just popped that value from
10784 the stack, which means we need to reset the CFA to
10785 the drap register. This will remain until we restore
10786 the stack pointer. */
10787 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10788 RTX_FRAME_RELATED_P (insn) = 1;
10790 /* This means that the DRAP register is valid for addressing. */
10791 m->fs.drap_valid = true;
10793 else
10794 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10796 cfa_offset -= UNITS_PER_WORD;
10800 /* Emit code to restore saved registers using MOV insns.
10801 First register is restored from CFA - CFA_OFFSET. */
10802 static void
10803 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10804 bool maybe_eh_return)
10806 unsigned int regno;
10808 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10809 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10811 rtx reg = gen_rtx_REG (V4SFmode, regno);
10812 rtx mem;
10814 mem = choose_baseaddr (cfa_offset);
10815 mem = gen_rtx_MEM (V4SFmode, mem);
10816 set_mem_align (mem, 128);
10817 emit_move_insn (reg, mem);
10819 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10821 cfa_offset -= 16;
10825 /* Restore function stack, frame, and registers. */
10827 void
10828 ix86_expand_epilogue (int style)
10830 struct machine_function *m = cfun->machine;
10831 struct machine_frame_state frame_state_save = m->fs;
10832 struct ix86_frame frame;
10833 bool restore_regs_via_mov;
10834 bool using_drap;
10836 ix86_finalize_stack_realign_flags ();
10837 ix86_compute_frame_layout (&frame);
10839 m->fs.sp_valid = (!frame_pointer_needed
10840 || (crtl->sp_is_unchanging
10841 && !stack_realign_fp));
10842 gcc_assert (!m->fs.sp_valid
10843 || m->fs.sp_offset == frame.stack_pointer_offset);
10845 /* The FP must be valid if the frame pointer is present. */
10846 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10847 gcc_assert (!m->fs.fp_valid
10848 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10850 /* We must have *some* valid pointer to the stack frame. */
10851 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10853 /* The DRAP is never valid at this point. */
10854 gcc_assert (!m->fs.drap_valid);
10856 /* See the comment about red zone and frame
10857 pointer usage in ix86_expand_prologue. */
10858 if (frame_pointer_needed && frame.red_zone_size)
10859 emit_insn (gen_memory_blockage ());
10861 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10862 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10864 /* Determine the CFA offset of the end of the red-zone. */
10865 m->fs.red_zone_offset = 0;
10866 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10868 /* The red-zone begins below the return address. */
10869 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
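/* E.g. in the common 64-bit case, with RED_ZONE_SIZE of 128 and
   UNITS_PER_WORD of 8, this places the end of the red zone 136 bytes
   below the CFA (illustrative figures; the DRAP adjustment below may
   move it further).  */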
10871 /* When the register save area is in the aligned portion of
10872 the stack, determine the maximum runtime displacement that
10873 matches up with the aligned frame. */
10874 if (stack_realign_drap)
10875 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10876 + UNITS_PER_WORD);
10879 /* Special care must be taken for the normal return case of a function
10880 using eh_return: the eax and edx registers are marked as saved, but
10881 not restored along this path. Adjust the save location to match. */
10882 if (crtl->calls_eh_return && style != 2)
10883 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10885 /* EH_RETURN requires the use of moves to function properly. */
10886 if (crtl->calls_eh_return)
10887 restore_regs_via_mov = true;
10888 /* SEH requires the use of pops to identify the epilogue. */
10889 else if (TARGET_SEH)
10890 restore_regs_via_mov = false;
10891 /* If we're only restoring one register and sp is not valid, then
10892 use a move instruction to restore the register, since it's
10893 less work than reloading sp and popping the register. */
10894 else if (!m->fs.sp_valid && frame.nregs <= 1)
10895 restore_regs_via_mov = true;
10896 else if (TARGET_EPILOGUE_USING_MOVE
10897 && cfun->machine->use_fast_prologue_epilogue
10898 && (frame.nregs > 1
10899 || m->fs.sp_offset != frame.reg_save_offset))
10900 restore_regs_via_mov = true;
10901 else if (frame_pointer_needed
10902 && !frame.nregs
10903 && m->fs.sp_offset != frame.reg_save_offset)
10904 restore_regs_via_mov = true;
10905 else if (frame_pointer_needed
10906 && TARGET_USE_LEAVE
10907 && cfun->machine->use_fast_prologue_epilogue
10908 && frame.nregs == 1)
10909 restore_regs_via_mov = true;
10910 else
10911 restore_regs_via_mov = false;
10913 if (restore_regs_via_mov || frame.nsseregs)
10915 /* Ensure that the entire register save area is addressable via
10916 the stack pointer, if we will restore via sp. */
10917 if (TARGET_64BIT
10918 && m->fs.sp_offset > 0x7fffffff
10919 && !(m->fs.fp_valid || m->fs.drap_valid)
10920 && (frame.nsseregs + frame.nregs) != 0)
10922 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10923 GEN_INT (m->fs.sp_offset
10924 - frame.sse_reg_save_offset),
10925 style,
10926 m->fs.cfa_reg == stack_pointer_rtx);
10930 /* If there are any SSE registers to restore, then we have to do it
10931 via moves, since there's obviously no pop for SSE regs. */
10932 if (frame.nsseregs)
10933 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10934 style == 2);
10936 if (restore_regs_via_mov)
10938 rtx t;
10940 if (frame.nregs)
10941 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10943 /* eh_return epilogues need %ecx added to the stack pointer. */
10944 if (style == 2)
10946 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10948 /* Stack align doesn't work with eh_return. */
10949 gcc_assert (!stack_realign_drap);
10950 /* Neither do regparm nested functions. */
10951 gcc_assert (!ix86_static_chain_on_stack);
10953 if (frame_pointer_needed)
10955 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10956 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10957 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10959 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10960 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10962 /* Note that we use SA as a temporary CFA, as the return
10963 address is at the proper place relative to it. We
10964 pretend this happens at the FP restore insn because
10965 prior to this insn the FP would be stored at the wrong
10966 offset relative to SA, and after this insn we have no
10967 other reasonable register to use for the CFA. We don't
10968 bother resetting the CFA to the SP for the duration of
10969 the return insn. */
10970 add_reg_note (insn, REG_CFA_DEF_CFA,
10971 plus_constant (Pmode, sa, UNITS_PER_WORD));
10972 ix86_add_queued_cfa_restore_notes (insn);
10973 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10974 RTX_FRAME_RELATED_P (insn) = 1;
10976 m->fs.cfa_reg = sa;
10977 m->fs.cfa_offset = UNITS_PER_WORD;
10978 m->fs.fp_valid = false;
10980 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10981 const0_rtx, style, false);
10983 else
10985 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10986 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10987 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10988 ix86_add_queued_cfa_restore_notes (insn);
10990 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10991 if (m->fs.cfa_offset != UNITS_PER_WORD)
10993 m->fs.cfa_offset = UNITS_PER_WORD;
10994 add_reg_note (insn, REG_CFA_DEF_CFA,
10995 plus_constant (Pmode, stack_pointer_rtx,
10996 UNITS_PER_WORD));
10997 RTX_FRAME_RELATED_P (insn) = 1;
11000 m->fs.sp_offset = UNITS_PER_WORD;
11001 m->fs.sp_valid = true;
11004 else
11006 /* SEH requires that the function end with (1) a stack adjustment
11007 if necessary, (2) a sequence of pops, and (3) a return or
11008 jump instruction. Prevent insns from the function body from
11009 being scheduled into this sequence. */
11010 if (TARGET_SEH)
11012 /* Prevent a catch region from being adjacent to the standard
11013 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11014 several other flags that would be interesting to test are
11015 set up yet. */
11016 if (flag_non_call_exceptions)
11017 emit_insn (gen_nops (const1_rtx));
11018 else
11019 emit_insn (gen_blockage ());
11022 /* First step is to deallocate the stack frame so that we can
11023 pop the registers. Also do it on the SEH target for a very large
11024 frame, as the emitted instructions aren't allowed by the ABI in
11025 epilogues. */
11026 if (!m->fs.sp_valid
11027 || (TARGET_SEH
11028 && (m->fs.sp_offset - frame.reg_save_offset
11029 >= SEH_MAX_FRAME_SIZE)))
11031 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11032 GEN_INT (m->fs.fp_offset
11033 - frame.reg_save_offset),
11034 style, false);
11036 else if (m->fs.sp_offset != frame.reg_save_offset)
11038 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11039 GEN_INT (m->fs.sp_offset
11040 - frame.reg_save_offset),
11041 style,
11042 m->fs.cfa_reg == stack_pointer_rtx);
11045 ix86_emit_restore_regs_using_pop ();
11048 /* If we used a stack pointer and haven't already got rid of it,
11049 then do so now. */
11050 if (m->fs.fp_valid)
11052 /* If the stack pointer is valid and pointing at the frame
11053 pointer store address, then we only need a pop. */
11054 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11055 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11056 /* Leave results in shorter dependency chains on CPUs that are
11057 able to grok it fast. */
11058 else if (TARGET_USE_LEAVE
11059 || optimize_function_for_size_p (cfun)
11060 || !cfun->machine->use_fast_prologue_epilogue)
11061 ix86_emit_leave ();
11062 else
11064 pro_epilogue_adjust_stack (stack_pointer_rtx,
11065 hard_frame_pointer_rtx,
11066 const0_rtx, style, !using_drap);
11067 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11071 if (using_drap)
11073 int param_ptr_offset = UNITS_PER_WORD;
11074 rtx insn;
11076 gcc_assert (stack_realign_drap);
11078 if (ix86_static_chain_on_stack)
11079 param_ptr_offset += UNITS_PER_WORD;
11080 if (!call_used_regs[REGNO (crtl->drap_reg)])
11081 param_ptr_offset += UNITS_PER_WORD;
11083 insn = emit_insn (gen_rtx_SET
11084 (VOIDmode, stack_pointer_rtx,
11085 gen_rtx_PLUS (Pmode,
11086 crtl->drap_reg,
11087 GEN_INT (-param_ptr_offset))));
11088 m->fs.cfa_reg = stack_pointer_rtx;
11089 m->fs.cfa_offset = param_ptr_offset;
11090 m->fs.sp_offset = param_ptr_offset;
11091 m->fs.realigned = false;
11093 add_reg_note (insn, REG_CFA_DEF_CFA,
11094 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11095 GEN_INT (param_ptr_offset)));
11096 RTX_FRAME_RELATED_P (insn) = 1;
11098 if (!call_used_regs[REGNO (crtl->drap_reg)])
11099 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11102 /* At this point the stack pointer must be valid, and we must have
11103 restored all of the registers. We may not have deallocated the
11104 entire stack frame. We've delayed this until now because it may
11105 be possible to merge the local stack deallocation with the
11106 deallocation forced by ix86_static_chain_on_stack. */
11107 gcc_assert (m->fs.sp_valid);
11108 gcc_assert (!m->fs.fp_valid);
11109 gcc_assert (!m->fs.realigned);
11110 if (m->fs.sp_offset != UNITS_PER_WORD)
11112 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11113 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11114 style, true);
11116 else
11117 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11119 /* Sibcall epilogues don't want a return instruction. */
11120 if (style == 0)
11122 m->fs = frame_state_save;
11123 return;
11126 if (crtl->args.pops_args && crtl->args.size)
11128 rtx popc = GEN_INT (crtl->args.pops_args);
11130 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11131 address, do explicit add, and jump indirectly to the caller. */
11133 if (crtl->args.pops_args >= 65536)
11135 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11136 rtx insn;
11138 /* There is no "pascal" calling convention in any 64bit ABI. */
11139 gcc_assert (!TARGET_64BIT);
11141 insn = emit_insn (gen_pop (ecx));
11142 m->fs.cfa_offset -= UNITS_PER_WORD;
11143 m->fs.sp_offset -= UNITS_PER_WORD;
11145 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11146 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11147 add_reg_note (insn, REG_CFA_REGISTER,
11148 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11149 RTX_FRAME_RELATED_P (insn) = 1;
11151 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11152 popc, -1, true);
11153 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11155 else
11156 emit_jump_insn (gen_simple_return_pop_internal (popc));
11158 else
11159 emit_jump_insn (gen_simple_return_internal ());
11161 /* Restore the state back to the state from the prologue,
11162 so that it's correct for the next epilogue. */
11163 m->fs = frame_state_save;
11166 /* Reset from the function's potential modifications. */
11168 static void
11169 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11170 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11172 if (pic_offset_table_rtx)
11173 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11174 #if TARGET_MACHO
11175 /* Mach-O doesn't support labels at the end of objects, so if
11176 it looks like we might want one, insert a NOP. */
11178 rtx insn = get_last_insn ();
11179 rtx deleted_debug_label = NULL_RTX;
11180 while (insn
11181 && NOTE_P (insn)
11182 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11184 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11185 notes only, instead set their CODE_LABEL_NUMBER to -1,
11186 otherwise there would be code generation differences
11187 in between -g and -g0. */
11188 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11189 deleted_debug_label = insn;
11190 insn = PREV_INSN (insn);
11192 if (insn
11193 && (LABEL_P (insn)
11194 || (NOTE_P (insn)
11195 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11196 fputs ("\tnop\n", file);
11197 else if (deleted_debug_label)
11198 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11199 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11200 CODE_LABEL_NUMBER (insn) = -1;
11202 #endif
11206 /* Return a scratch register to use in the split stack prologue. The
11207 split stack prologue is used for -fsplit-stack. It is the first
11208 instructions in the function, even before the regular prologue.
11209 The scratch register can be any caller-saved register which is not
11210 used for parameters or for the static chain. */
11212 static unsigned int
11213 split_stack_prologue_scratch_regno (void)
11215 if (TARGET_64BIT)
11216 return R11_REG;
11217 else
11219 bool is_fastcall, is_thiscall;
11220 int regparm;
11222 is_fastcall = (lookup_attribute ("fastcall",
11223 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11224 != NULL);
11225 is_thiscall = (lookup_attribute ("thiscall",
11226 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11227 != NULL);
11228 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11230 if (is_fastcall)
11232 if (DECL_STATIC_CHAIN (cfun->decl))
11234 sorry ("-fsplit-stack does not support fastcall with "
11235 "nested function");
11236 return INVALID_REGNUM;
11238 return AX_REG;
11240 else if (is_thiscall)
11242 if (!DECL_STATIC_CHAIN (cfun->decl))
11243 return DX_REG;
11244 return AX_REG;
11246 else if (regparm < 3)
11248 if (!DECL_STATIC_CHAIN (cfun->decl))
11249 return CX_REG;
11250 else
11252 if (regparm >= 2)
11254 sorry ("-fsplit-stack does not support 2 register "
11255 "parameters for a nested function");
11256 return INVALID_REGNUM;
11258 return DX_REG;
11261 else
11263 /* FIXME: We could make this work by pushing a register
11264 around the addition and comparison. */
11265 sorry ("-fsplit-stack does not support 3 register parameters");
11266 return INVALID_REGNUM;
11271 /* A SYMBOL_REF for the function which allocates new stack space for
11272 -fsplit-stack. */
11274 static GTY(()) rtx split_stack_fn;
11276 /* A SYMBOL_REF for the more stack function when using the large
11277 model. */
11279 static GTY(()) rtx split_stack_fn_large;
11281 /* Handle -fsplit-stack. These are the first instructions in the
11282 function, even before the regular prologue. */
11284 void
11285 ix86_expand_split_stack_prologue (void)
11287 struct ix86_frame frame;
11288 HOST_WIDE_INT allocate;
11289 unsigned HOST_WIDE_INT args_size;
11290 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11291 rtx scratch_reg = NULL_RTX;
11292 rtx varargs_label = NULL_RTX;
11293 rtx fn;
11295 gcc_assert (flag_split_stack && reload_completed);
11297 ix86_finalize_stack_realign_flags ();
11298 ix86_compute_frame_layout (&frame);
11299 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11301 /* This is the label we will branch to if we have enough stack
11302 space. We expect the basic block reordering pass to reverse this
11303 branch if optimizing, so that we branch in the unlikely case. */
11304 label = gen_label_rtx ();
11306 /* We need to compare the stack pointer minus the frame size with
11307 the stack boundary in the TCB. The stack boundary always gives
11308 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11309 can compare directly. Otherwise we need to do an addition. */
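/* A rough sketch of the check emitted below (the stack boundary lives at
   a target-defined offset in the TCB, addressed via the thread segment
   register; when ALLOCATE is below SPLIT_STACK_AVAILABLE the guaranteed
   slack lets us compare the stack pointer directly):

     if (sp - allocate >= tcb->stack_boundary)
       goto label;          we have enough stack
     call __morestack;      otherwise grow the stack  */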
11311 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11312 UNSPEC_STACK_CHECK);
11313 limit = gen_rtx_CONST (Pmode, limit);
11314 limit = gen_rtx_MEM (Pmode, limit);
11315 if (allocate < SPLIT_STACK_AVAILABLE)
11316 current = stack_pointer_rtx;
11317 else
11319 unsigned int scratch_regno;
11320 rtx offset;
11322 /* We need a scratch register to hold the stack pointer minus
11323 the required frame size. Since this is the very start of the
11324 function, the scratch register can be any caller-saved
11325 register which is not used for parameters. */
11326 offset = GEN_INT (- allocate);
11327 scratch_regno = split_stack_prologue_scratch_regno ();
11328 if (scratch_regno == INVALID_REGNUM)
11329 return;
11330 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11331 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11333 /* We don't use ix86_gen_add3 in this case because it will
11334 want to split to lea, but when not optimizing the insn
11335 will not be split after this point. */
11336 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11337 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11338 offset)));
11340 else
11342 emit_move_insn (scratch_reg, offset);
11343 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11344 stack_pointer_rtx));
11346 current = scratch_reg;
11349 ix86_expand_branch (GEU, current, limit, label);
11350 jump_insn = get_last_insn ();
11351 JUMP_LABEL (jump_insn) = label;
11353 /* Mark the jump as very likely to be taken. */
11354 add_reg_note (jump_insn, REG_BR_PROB,
11355 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11357 if (split_stack_fn == NULL_RTX)
11358 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11359 fn = split_stack_fn;
11361 /* Get more stack space. We pass in the desired stack space and the
11362 size of the arguments to copy to the new stack. In 32-bit mode
11363 we push the parameters; __morestack will return on a new stack
11364 anyhow. In 64-bit mode we pass the parameters in r10 and
11365 r11. */
11366 allocate_rtx = GEN_INT (allocate);
11367 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11368 call_fusage = NULL_RTX;
11369 if (TARGET_64BIT)
11371 rtx reg10, reg11;
11373 reg10 = gen_rtx_REG (Pmode, R10_REG);
11374 reg11 = gen_rtx_REG (Pmode, R11_REG);
11376 /* If this function uses a static chain, it will be in %r10.
11377 Preserve it across the call to __morestack. */
11378 if (DECL_STATIC_CHAIN (cfun->decl))
11380 rtx rax;
11382 rax = gen_rtx_REG (word_mode, AX_REG);
11383 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11384 use_reg (&call_fusage, rax);
11387 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11389 HOST_WIDE_INT argval;
11391 gcc_assert (Pmode == DImode);
11392 /* When using the large model we need to load the address
11393 into a register, and we've run out of registers. So we
11394 switch to a different calling convention, and we call a
11395 different function: __morestack_large_model. We pass the
11396 argument size in the upper 32 bits of r10 and pass the
11397 frame size in the lower 32 bits. */
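/* E.g. (illustrative values only): with args_size == 0x20 and
   allocate == 0x1000, the value loaded into %r10 below is
   0x0000002000001000 -- argument size in the high half, frame size in
   the low half.  */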
11398 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11399 gcc_assert ((args_size & 0xffffffff) == args_size);
11401 if (split_stack_fn_large == NULL_RTX)
11402 split_stack_fn_large =
11403 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11405 if (ix86_cmodel == CM_LARGE_PIC)
11407 rtx label, x;
11409 label = gen_label_rtx ();
11410 emit_label (label);
11411 LABEL_PRESERVE_P (label) = 1;
11412 emit_insn (gen_set_rip_rex64 (reg10, label));
11413 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11414 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11415 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11416 UNSPEC_GOT);
11417 x = gen_rtx_CONST (Pmode, x);
11418 emit_move_insn (reg11, x);
11419 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11420 x = gen_const_mem (Pmode, x);
11421 emit_move_insn (reg11, x);
11423 else
11424 emit_move_insn (reg11, split_stack_fn_large);
11426 fn = reg11;
11428 argval = ((args_size << 16) << 16) + allocate;
11429 emit_move_insn (reg10, GEN_INT (argval));
11431 else
11433 emit_move_insn (reg10, allocate_rtx);
11434 emit_move_insn (reg11, GEN_INT (args_size));
11435 use_reg (&call_fusage, reg11);
11438 use_reg (&call_fusage, reg10);
11440 else
11442 emit_insn (gen_push (GEN_INT (args_size)));
11443 emit_insn (gen_push (allocate_rtx));
11445 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11446 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11447 NULL_RTX, false);
11448 add_function_usage_to (call_insn, call_fusage);
11450 /* In order to make call/return prediction work right, we now need
11451 to execute a return instruction. See
11452 libgcc/config/i386/morestack.S for the details on how this works.
11454 For flow purposes gcc must not see this as a return
11455 instruction--we need control flow to continue at the subsequent
11456 label. Therefore, we use an unspec. */
11457 gcc_assert (crtl->args.pops_args < 65536);
11458 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11460 /* If we are in 64-bit mode and this function uses a static chain,
11461 we saved %r10 in %rax before calling __morestack. */
11462 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11463 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11464 gen_rtx_REG (word_mode, AX_REG));
11466 /* If this function calls va_start, we need to store a pointer to
11467 the arguments on the old stack, because they may not have been
11468 all copied to the new stack. At this point the old stack can be
11469 found at the frame pointer value used by __morestack, because
11470 __morestack has set that up before calling back to us. Here we
11471 store that pointer in a scratch register, and in
11472 ix86_expand_prologue we store the scratch register in a stack
11473 slot. */
11474 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11476 unsigned int scratch_regno;
11477 rtx frame_reg;
11478 int words;
11480 scratch_regno = split_stack_prologue_scratch_regno ();
11481 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11482 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11484 /* 64-bit:
11485 fp -> old fp value
11486 return address within this function
11487 return address of caller of this function
11488 stack arguments
11489 So we add three words to get to the stack arguments.
11491 32-bit:
11492 fp -> old fp value
11493 return address within this function
11494 first argument to __morestack
11495 second argument to __morestack
11496 return address of caller of this function
11497 stack arguments
11498 So we add five words to get to the stack arguments.  */
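/* E.g. with UNITS_PER_WORD of 8 in 64-bit mode the scratch register is
   set to fp + 24 (three words); in 32-bit mode it is fp + 20 (five
   4-byte words).  */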
11500 words = TARGET_64BIT ? 3 : 5;
11501 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11502 gen_rtx_PLUS (Pmode, frame_reg,
11503 GEN_INT (words * UNITS_PER_WORD))));
11505 varargs_label = gen_label_rtx ();
11506 emit_jump_insn (gen_jump (varargs_label));
11507 JUMP_LABEL (get_last_insn ()) = varargs_label;
11509 emit_barrier ();
11512 emit_label (label);
11513 LABEL_NUSES (label) = 1;
11515 /* If this function calls va_start, we now have to set the scratch
11516 register for the case where we do not call __morestack. In this
11517 case we need to set it based on the stack pointer. */
11518 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11520 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11521 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11522 GEN_INT (UNITS_PER_WORD))));
11524 emit_label (varargs_label);
11525 LABEL_NUSES (varargs_label) = 1;
11529 /* We may have to tell the dataflow pass that the split stack prologue
11530 is initializing a scratch register. */
11532 static void
11533 ix86_live_on_entry (bitmap regs)
11535 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11537 gcc_assert (flag_split_stack);
11538 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11542 /* Determine if op is a suitable SUBREG RTX for an address. */
11544 static bool
11545 ix86_address_subreg_operand (rtx op)
11547 enum machine_mode mode;
11549 if (!REG_P (op))
11550 return false;
11552 mode = GET_MODE (op);
11554 if (GET_MODE_CLASS (mode) != MODE_INT)
11555 return false;
11557 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11558 failures when the register is one word out of a two word structure. */
11559 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11560 return false;
11562 /* Allow only SUBREGs of non-eliminable hard registers. */
11563 return register_no_elim_operand (op, mode);
11566 /* Extract the parts of an RTL expression that is a valid memory address
11567 for an instruction. Return 0 if the structure of the address is
11568 grossly off. Return -1 if the address contains ASHIFT, so it is not
11569 strictly valid, but still used for computing the length of the lea instruction. */
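/* For example, an address in the canonical base + index*scale + disp
   form, such as 12(%ebx,%ecx,4) in 32-bit code, decomposes into
   base = %ebx, index = %ecx, scale = 4, disp = 12 (one illustrative
   case; segment overrides and unspecs are handled below as well).  */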
11572 ix86_decompose_address (rtx addr, struct ix86_address *out)
11574 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11575 rtx base_reg, index_reg;
11576 HOST_WIDE_INT scale = 1;
11577 rtx scale_rtx = NULL_RTX;
11578 rtx tmp;
11579 int retval = 1;
11580 enum ix86_address_seg seg = SEG_DEFAULT;
11582 /* Allow zero-extended SImode addresses,
11583 they will be emitted with addr32 prefix. */
11584 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11586 if (GET_CODE (addr) == ZERO_EXTEND
11587 && GET_MODE (XEXP (addr, 0)) == SImode)
11589 addr = XEXP (addr, 0);
11590 if (CONST_INT_P (addr))
11591 return 0;
11593 else if (GET_CODE (addr) == AND
11594 && const_32bit_mask (XEXP (addr, 1), DImode))
11596 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11597 if (addr == NULL_RTX)
11598 return 0;
11600 if (CONST_INT_P (addr))
11601 return 0;
11605 /* Allow SImode subregs of DImode addresses,
11606 they will be emitted with addr32 prefix. */
11607 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11609 if (GET_CODE (addr) == SUBREG
11610 && GET_MODE (SUBREG_REG (addr)) == DImode)
11612 addr = SUBREG_REG (addr);
11613 if (CONST_INT_P (addr))
11614 return 0;
11618 if (REG_P (addr))
11619 base = addr;
11620 else if (GET_CODE (addr) == SUBREG)
11622 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11623 base = addr;
11624 else
11625 return 0;
11627 else if (GET_CODE (addr) == PLUS)
11629 rtx addends[4], op;
11630 int n = 0, i;
11632 op = addr;
11635 if (n >= 4)
11636 return 0;
11637 addends[n++] = XEXP (op, 1);
11638 op = XEXP (op, 0);
11640 while (GET_CODE (op) == PLUS);
11641 if (n >= 4)
11642 return 0;
11643 addends[n] = op;
11645 for (i = n; i >= 0; --i)
11647 op = addends[i];
11648 switch (GET_CODE (op))
11650 case MULT:
11651 if (index)
11652 return 0;
11653 index = XEXP (op, 0);
11654 scale_rtx = XEXP (op, 1);
11655 break;
11657 case ASHIFT:
11658 if (index)
11659 return 0;
11660 index = XEXP (op, 0);
11661 tmp = XEXP (op, 1);
11662 if (!CONST_INT_P (tmp))
11663 return 0;
11664 scale = INTVAL (tmp);
11665 if ((unsigned HOST_WIDE_INT) scale > 3)
11666 return 0;
11667 scale = 1 << scale;
11668 break;
11670 case ZERO_EXTEND:
11671 op = XEXP (op, 0);
11672 if (GET_CODE (op) != UNSPEC)
11673 return 0;
11674 /* FALLTHRU */
11676 case UNSPEC:
11677 if (XINT (op, 1) == UNSPEC_TP
11678 && TARGET_TLS_DIRECT_SEG_REFS
11679 && seg == SEG_DEFAULT)
11680 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11681 else
11682 return 0;
11683 break;
11685 case SUBREG:
11686 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11687 return 0;
11688 /* FALLTHRU */
11690 case REG:
11691 if (!base)
11692 base = op;
11693 else if (!index)
11694 index = op;
11695 else
11696 return 0;
11697 break;
11699 case CONST:
11700 case CONST_INT:
11701 case SYMBOL_REF:
11702 case LABEL_REF:
11703 if (disp)
11704 return 0;
11705 disp = op;
11706 break;
11708 default:
11709 return 0;
11713 else if (GET_CODE (addr) == MULT)
11715 index = XEXP (addr, 0); /* index*scale */
11716 scale_rtx = XEXP (addr, 1);
11718 else if (GET_CODE (addr) == ASHIFT)
11720 /* We're called for lea too, which implements ashift on occasion. */
11721 index = XEXP (addr, 0);
11722 tmp = XEXP (addr, 1);
11723 if (!CONST_INT_P (tmp))
11724 return 0;
11725 scale = INTVAL (tmp);
11726 if ((unsigned HOST_WIDE_INT) scale > 3)
11727 return 0;
11728 scale = 1 << scale;
11729 retval = -1;
11731 else if (CONST_INT_P (addr))
11733 if (!x86_64_immediate_operand (addr, VOIDmode))
11734 return 0;
11736 /* Constant addresses are sign-extended to 64bit; we have to
11737 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
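/* E.g. the constant address 0x80000000 would be sign-extended to
   0xffffffff80000000 by the hardware, which lies outside the 32-bit x32
   address space, so such displacements must be rejected.  */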
11738 if (TARGET_X32
11739 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11740 return 0;
11742 disp = addr;
11744 else
11745 disp = addr; /* displacement */
11747 if (index)
11749 if (REG_P (index))
11751 else if (GET_CODE (index) == SUBREG
11752 && ix86_address_subreg_operand (SUBREG_REG (index)))
11754 else
11755 return 0;
11758 /* Address override works only on the (%reg) part of %fs:(%reg). */
11759 if (seg != SEG_DEFAULT
11760 && ((base && GET_MODE (base) != word_mode)
11761 || (index && GET_MODE (index) != word_mode)))
11762 return 0;
11764 /* Extract the integral value of scale. */
11765 if (scale_rtx)
11767 if (!CONST_INT_P (scale_rtx))
11768 return 0;
11769 scale = INTVAL (scale_rtx);
11772 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11773 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11775 /* Avoid useless 0 displacement. */
11776 if (disp == const0_rtx && (base || index))
11777 disp = NULL_RTX;
11779 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11780 if (base_reg && index_reg && scale == 1
11781 && (index_reg == arg_pointer_rtx
11782 || index_reg == frame_pointer_rtx
11783 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11785 rtx tmp;
11786 tmp = base, base = index, index = tmp;
11787 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11790 /* Special case: %ebp cannot be encoded as a base without a displacement.
11791 Similarly %r13. */
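/* E.g. (%ebp) or (%r13) cannot be emitted as written: the mod=00
   encoding that would name them as a plain base is reused for
   disp32-only (or RIP-relative) addressing, so an explicit zero
   displacement is added and 0(%ebp) / 0(%r13) is emitted instead.  */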
11792 if (!disp
11793 && base_reg
11794 && (base_reg == hard_frame_pointer_rtx
11795 || base_reg == frame_pointer_rtx
11796 || base_reg == arg_pointer_rtx
11797 || (REG_P (base_reg)
11798 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11799 || REGNO (base_reg) == R13_REG))))
11800 disp = const0_rtx;
11802 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11803 Avoid this by transforming to [%esi+0].
11804 Reload calls address legitimization without cfun defined, so we need
11805 to test cfun for being non-NULL. */
11806 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11807 && base_reg && !index_reg && !disp
11808 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11809 disp = const0_rtx;
11811 /* Special case: encode reg+reg instead of reg*2. */
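/* E.g. prefer (%eax,%eax) over (,%eax,2): a scaled index with no base
   needs a full 32-bit displacement field, so the reg+reg form encodes
   shorter.  */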
11812 if (!base && index && scale == 2)
11813 base = index, base_reg = index_reg, scale = 1;
11815 /* Special case: scaling cannot be encoded without base or displacement. */
11816 if (!base && !disp && index && scale != 1)
11817 disp = const0_rtx;
11819 out->base = base;
11820 out->index = index;
11821 out->disp = disp;
11822 out->scale = scale;
11823 out->seg = seg;
11825 return retval;
11828 /* Return cost of the memory address x.
11829 For i386, it is better to use a complex address than let gcc copy
11830 the address into a reg and make a new pseudo. But not if the address
11831 requires two regs - that would mean more pseudos with longer
11832 lifetimes. */
11833 static int
11834 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11835 addr_space_t as ATTRIBUTE_UNUSED,
11836 bool speed ATTRIBUTE_UNUSED)
11838 struct ix86_address parts;
11839 int cost = 1;
11840 int ok = ix86_decompose_address (x, &parts);
11842 gcc_assert (ok);
11844 if (parts.base && GET_CODE (parts.base) == SUBREG)
11845 parts.base = SUBREG_REG (parts.base);
11846 if (parts.index && GET_CODE (parts.index) == SUBREG)
11847 parts.index = SUBREG_REG (parts.index);
11849 /* Attempt to minimize number of registers in the address. */
11850 if ((parts.base
11851 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11852 || (parts.index
11853 && (!REG_P (parts.index)
11854 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11855 cost++;
11857 if (parts.base
11858 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11859 && parts.index
11860 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11861 && parts.base != parts.index)
11862 cost++;
11864 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11865 since its predecode logic can't detect the length of instructions
11866 and decoding degenerates to vector decode. Increase the cost of such
11867 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11868 to split such addresses or even refuse such addresses at all.
11870 Following addressing modes are affected:
11871 [base+scale*index]
11872 [scale*index+disp]
11873 [base+index]
11875 The first and last case may be avoidable by explicitly coding the zero in
11876 the memory address, but I don't have an AMD-K6 machine handy to check this
11877 theory. */
11879 if (TARGET_K6
11880 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11881 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11882 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11883 cost += 10;
11885 return cost;
11888 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11889 this is used to form addresses to local data when -fPIC is in
11890 use. */
11892 static bool
11893 darwin_local_data_pic (rtx disp)
11895 return (GET_CODE (disp) == UNSPEC
11896 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11899 /* Determine if a given RTX is a valid constant. We already know this
11900 satisfies CONSTANT_P. */
11902 static bool
11903 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11905 switch (GET_CODE (x))
11907 case CONST:
11908 x = XEXP (x, 0);
11910 if (GET_CODE (x) == PLUS)
11912 if (!CONST_INT_P (XEXP (x, 1)))
11913 return false;
11914 x = XEXP (x, 0);
11917 if (TARGET_MACHO && darwin_local_data_pic (x))
11918 return true;
11920 /* Only some unspecs are valid as "constants". */
11921 if (GET_CODE (x) == UNSPEC)
11922 switch (XINT (x, 1))
11924 case UNSPEC_GOT:
11925 case UNSPEC_GOTOFF:
11926 case UNSPEC_PLTOFF:
11927 return TARGET_64BIT;
11928 case UNSPEC_TPOFF:
11929 case UNSPEC_NTPOFF:
11930 x = XVECEXP (x, 0, 0);
11931 return (GET_CODE (x) == SYMBOL_REF
11932 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11933 case UNSPEC_DTPOFF:
11934 x = XVECEXP (x, 0, 0);
11935 return (GET_CODE (x) == SYMBOL_REF
11936 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11937 default:
11938 return false;
11941 /* We must have drilled down to a symbol. */
11942 if (GET_CODE (x) == LABEL_REF)
11943 return true;
11944 if (GET_CODE (x) != SYMBOL_REF)
11945 return false;
11946 /* FALLTHRU */
11948 case SYMBOL_REF:
11949 /* TLS symbols are never valid. */
11950 if (SYMBOL_REF_TLS_MODEL (x))
11951 return false;
11953 /* DLLIMPORT symbols are never valid. */
11954 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11955 && SYMBOL_REF_DLLIMPORT_P (x))
11956 return false;
11958 #if TARGET_MACHO
11959 /* mdynamic-no-pic */
11960 if (MACHO_DYNAMIC_NO_PIC_P)
11961 return machopic_symbol_defined_p (x);
11962 #endif
11963 break;
11965 case CONST_DOUBLE:
11966 if (GET_MODE (x) == TImode
11967 && x != CONST0_RTX (TImode)
11968 && !TARGET_64BIT)
11969 return false;
11970 break;
11972 case CONST_VECTOR:
11973 if (!standard_sse_constant_p (x))
11974 return false;
11976 default:
11977 break;
11980 /* Otherwise we handle everything else in the move patterns. */
11981 return true;
11984 /* Determine if it's legal to put X into the constant pool. This
11985 is not possible for the address of thread-local symbols, which
11986 is checked above. */
11988 static bool
11989 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11991 /* We can always put integral constants and vectors in memory. */
11992 switch (GET_CODE (x))
11994 case CONST_INT:
11995 case CONST_DOUBLE:
11996 case CONST_VECTOR:
11997 return false;
11999 default:
12000 break;
12002 return !ix86_legitimate_constant_p (mode, x);
12006 /* Nonzero if the constant value X is a legitimate general operand
12007 when generating PIC code. It is given that flag_pic is on and
12008 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12010 bool
12011 legitimate_pic_operand_p (rtx x)
12013 rtx inner;
12015 switch (GET_CODE (x))
12017 case CONST:
12018 inner = XEXP (x, 0);
12019 if (GET_CODE (inner) == PLUS
12020 && CONST_INT_P (XEXP (inner, 1)))
12021 inner = XEXP (inner, 0);
12023 /* Only some unspecs are valid as "constants". */
12024 if (GET_CODE (inner) == UNSPEC)
12025 switch (XINT (inner, 1))
12027 case UNSPEC_GOT:
12028 case UNSPEC_GOTOFF:
12029 case UNSPEC_PLTOFF:
12030 return TARGET_64BIT;
12031 case UNSPEC_TPOFF:
12032 x = XVECEXP (inner, 0, 0);
12033 return (GET_CODE (x) == SYMBOL_REF
12034 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12035 case UNSPEC_MACHOPIC_OFFSET:
12036 return legitimate_pic_address_disp_p (x);
12037 default:
12038 return false;
12040 /* FALLTHRU */
12042 case SYMBOL_REF:
12043 case LABEL_REF:
12044 return legitimate_pic_address_disp_p (x);
12046 default:
12047 return true;
12051 /* Determine if a given CONST RTX is a valid memory displacement
12052 in PIC mode. */
12054 bool
12055 legitimate_pic_address_disp_p (rtx disp)
12057 bool saw_plus;
12059 /* In 64bit mode we can allow direct addresses of symbols and labels
12060 when they are not dynamic symbols. */
12061 if (TARGET_64BIT)
12063 rtx op0 = disp, op1;
12065 switch (GET_CODE (disp))
12067 case LABEL_REF:
12068 return true;
12070 case CONST:
12071 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12072 break;
12073 op0 = XEXP (XEXP (disp, 0), 0);
12074 op1 = XEXP (XEXP (disp, 0), 1);
12075 if (!CONST_INT_P (op1)
12076 || INTVAL (op1) >= 16*1024*1024
12077 || INTVAL (op1) < -16*1024*1024)
12078 break;
12079 if (GET_CODE (op0) == LABEL_REF)
12080 return true;
12081 if (GET_CODE (op0) == CONST
12082 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12083 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12084 return true;
12085 if (GET_CODE (op0) == UNSPEC
12086 && XINT (op0, 1) == UNSPEC_PCREL)
12087 return true;
12088 if (GET_CODE (op0) != SYMBOL_REF)
12089 break;
12090 /* FALLTHRU */
12092 case SYMBOL_REF:
12093 /* TLS references should always be enclosed in UNSPEC. */
12094 if (SYMBOL_REF_TLS_MODEL (op0))
12095 return false;
12096 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12097 && ix86_cmodel != CM_LARGE_PIC)
12098 return true;
12099 break;
12101 default:
12102 break;
12105 if (GET_CODE (disp) != CONST)
12106 return false;
12107 disp = XEXP (disp, 0);
12109 if (TARGET_64BIT)
12111 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
12112 of GOT tables. We should not need these anyway. */
12113 if (GET_CODE (disp) != UNSPEC
12114 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12115 && XINT (disp, 1) != UNSPEC_GOTOFF
12116 && XINT (disp, 1) != UNSPEC_PCREL
12117 && XINT (disp, 1) != UNSPEC_PLTOFF))
12118 return false;
12120 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12121 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12122 return false;
12123 return true;
12126 saw_plus = false;
12127 if (GET_CODE (disp) == PLUS)
12129 if (!CONST_INT_P (XEXP (disp, 1)))
12130 return false;
12131 disp = XEXP (disp, 0);
12132 saw_plus = true;
12135 if (TARGET_MACHO && darwin_local_data_pic (disp))
12136 return true;
12138 if (GET_CODE (disp) != UNSPEC)
12139 return false;
12141 switch (XINT (disp, 1))
12143 case UNSPEC_GOT:
12144 if (saw_plus)
12145 return false;
12146 /* We need to check for both symbols and labels because VxWorks loads
12147 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12148 details. */
12149 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12150 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12151 case UNSPEC_GOTOFF:
12152 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12153 While the ABI also specifies a 32bit relocation, we don't produce it in
12154 the small PIC model at all. */
12155 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12156 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12157 && !TARGET_64BIT)
12158 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12159 return false;
12160 case UNSPEC_GOTTPOFF:
12161 case UNSPEC_GOTNTPOFF:
12162 case UNSPEC_INDNTPOFF:
12163 if (saw_plus)
12164 return false;
12165 disp = XVECEXP (disp, 0, 0);
12166 return (GET_CODE (disp) == SYMBOL_REF
12167 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12168 case UNSPEC_NTPOFF:
12169 disp = XVECEXP (disp, 0, 0);
12170 return (GET_CODE (disp) == SYMBOL_REF
12171 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12172 case UNSPEC_DTPOFF:
12173 disp = XVECEXP (disp, 0, 0);
12174 return (GET_CODE (disp) == SYMBOL_REF
12175 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12178 return false;
12181 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12182 replace the input X, or the original X if no replacement is called for.
12183 The output parameter *WIN is 1 if the calling macro should goto WIN,
12184 0 if it should not. */
12186 bool
12187 ix86_legitimize_reload_address (rtx x,
12188 enum machine_mode mode ATTRIBUTE_UNUSED,
12189 int opnum, int type,
12190 int ind_levels ATTRIBUTE_UNUSED)
12192 /* Reload can generate:
12194 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12195 (reg:DI 97))
12196 (reg:DI 2 cx))
12198 This RTX is rejected from ix86_legitimate_address_p due to
12199 non-strictness of base register 97. Following this rejection,
12200 reload pushes all three components into separate registers,
12201 creating invalid memory address RTX.
12203 Following code reloads only the invalid part of the
12204 memory address RTX. */
12206 if (GET_CODE (x) == PLUS
12207 && REG_P (XEXP (x, 1))
12208 && GET_CODE (XEXP (x, 0)) == PLUS
12209 && REG_P (XEXP (XEXP (x, 0), 1)))
12211 rtx base, index;
12212 bool something_reloaded = false;
12214 base = XEXP (XEXP (x, 0), 1);
12215 if (!REG_OK_FOR_BASE_STRICT_P (base))
12217 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12218 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12219 opnum, (enum reload_type) type);
12220 something_reloaded = true;
12223 index = XEXP (x, 1);
12224 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12226 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12227 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12228 opnum, (enum reload_type) type);
12229 something_reloaded = true;
12232 gcc_assert (something_reloaded);
12233 return true;
12236 return false;
12239 /* Recognizes RTL expressions that are valid memory addresses for an
12240 instruction. The MODE argument is the machine mode for the MEM
12241 expression that wants to use this address.
12243 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12244 convert common non-canonical forms to canonical form so that they will
12245 be recognized. */
12247 static bool
12248 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12249 rtx addr, bool strict)
12251 struct ix86_address parts;
12252 rtx base, index, disp;
12253 HOST_WIDE_INT scale;
12255 if (ix86_decompose_address (addr, &parts) <= 0)
12256 /* Decomposition failed. */
12257 return false;
12259 base = parts.base;
12260 index = parts.index;
12261 disp = parts.disp;
12262 scale = parts.scale;
12264 /* Validate base register. */
12265 if (base)
12267 rtx reg;
12269 if (REG_P (base))
12270 reg = base;
12271 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12272 reg = SUBREG_REG (base);
12273 else
12274 /* Base is not a register. */
12275 return false;
12277 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12278 return false;
12280 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12281 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12282 /* Base is not valid. */
12283 return false;
12286 /* Validate index register. */
12287 if (index)
12289 rtx reg;
12291 if (REG_P (index))
12292 reg = index;
12293 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12294 reg = SUBREG_REG (index);
12295 else
12296 /* Index is not a register. */
12297 return false;
12299 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12300 return false;
12302 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12303 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12304 /* Index is not valid. */
12305 return false;
12308 /* Index and base should have the same mode. */
12309 if (base && index
12310 && GET_MODE (base) != GET_MODE (index))
12311 return false;
12313 /* Validate scale factor. */
12314 if (scale != 1)
12316 if (!index)
12317 /* Scale without index. */
12318 return false;
12320 if (scale != 2 && scale != 4 && scale != 8)
12321 /* Scale is not a valid multiplier. */
12322 return false;
12325 /* Validate displacement. */
12326 if (disp)
12328 if (GET_CODE (disp) == CONST
12329 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12330 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12331 switch (XINT (XEXP (disp, 0), 1))
12333 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12334 used. While the ABI also specifies 32bit relocations, we don't produce
12335 them at all and use IP-relative addressing instead. */
12336 case UNSPEC_GOT:
12337 case UNSPEC_GOTOFF:
12338 gcc_assert (flag_pic);
12339 if (!TARGET_64BIT)
12340 goto is_legitimate_pic;
12342 /* 64bit address unspec. */
12343 return false;
12345 case UNSPEC_GOTPCREL:
12346 case UNSPEC_PCREL:
12347 gcc_assert (flag_pic);
12348 goto is_legitimate_pic;
12350 case UNSPEC_GOTTPOFF:
12351 case UNSPEC_GOTNTPOFF:
12352 case UNSPEC_INDNTPOFF:
12353 case UNSPEC_NTPOFF:
12354 case UNSPEC_DTPOFF:
12355 break;
12357 case UNSPEC_STACK_CHECK:
12358 gcc_assert (flag_split_stack);
12359 break;
12361 default:
12362 /* Invalid address unspec. */
12363 return false;
12366 else if (SYMBOLIC_CONST (disp)
12367 && (flag_pic
12368 || (TARGET_MACHO
12369 #if TARGET_MACHO
12370 && MACHOPIC_INDIRECT
12371 && !machopic_operand_p (disp)
12372 #endif
12376 is_legitimate_pic:
12377 if (TARGET_64BIT && (index || base))
12379 /* foo@dtpoff(%rX) is ok. */
12380 if (GET_CODE (disp) != CONST
12381 || GET_CODE (XEXP (disp, 0)) != PLUS
12382 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12383 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12384 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12385 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12386 /* Non-constant pic memory reference. */
12387 return false;
12389 else if ((!TARGET_MACHO || flag_pic)
12390 && ! legitimate_pic_address_disp_p (disp))
12391 /* Displacement is an invalid pic construct. */
12392 return false;
12393 #if TARGET_MACHO
12394 else if (MACHO_DYNAMIC_NO_PIC_P
12395 && !ix86_legitimate_constant_p (Pmode, disp))
12396 /* displacement must be referenced via non_lazy_pointer */
12397 return false;
12398 #endif
12400 /* This code used to verify that a symbolic pic displacement
12401 includes the pic_offset_table_rtx register.
12403 While this is a good idea, unfortunately these constructs may
12404 be created by the "adds using lea" optimization for incorrect
12405 code like:
12407 int a;
12408 int foo(int i)
12410 return *(&a+i);
12413 This code is nonsensical, but results in addressing the
12414 GOT table with a pic_offset_table_rtx base. We can't
12415 just refuse it easily, since it gets matched by the
12416 "addsi3" pattern, which later gets split to lea when the
12417 output register differs from the input. While this
12418 could be handled by a separate addsi pattern for this case
12419 that never results in lea, disabling this test seems to be
12420 the easier and correct fix for the crash. */
12422 else if (GET_CODE (disp) != LABEL_REF
12423 && !CONST_INT_P (disp)
12424 && (GET_CODE (disp) != CONST
12425 || !ix86_legitimate_constant_p (Pmode, disp))
12426 && (GET_CODE (disp) != SYMBOL_REF
12427 || !ix86_legitimate_constant_p (Pmode, disp)))
12428 /* Displacement is not constant. */
12429 return false;
12430 else if (TARGET_64BIT
12431 && !x86_64_immediate_operand (disp, VOIDmode))
12432 /* Displacement is out of range. */
12433 return false;
12436 /* Everything looks valid. */
12437 return true;
12440 /* Determine if a given RTX is a valid constant address. */
12442 bool
12443 constant_address_p (rtx x)
12445 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12448 /* Return a unique alias set for the GOT. */
12450 static alias_set_type
12451 ix86_GOT_alias_set (void)
12453 static alias_set_type set = -1;
12454 if (set == -1)
12455 set = new_alias_set ();
12456 return set;
12459 /* Return a legitimate reference for ORIG (an address) using the
12460 register REG. If REG is 0, a new pseudo is generated.
12462 There are two types of references that must be handled:
12464 1. Global data references must load the address from the GOT, via
12465 the PIC reg. An insn is emitted to do this load, and the reg is
12466 returned.
12468 2. Static data references, constant pool addresses, and code labels
12469 compute the address as an offset from the GOT, whose base is in
12470 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12471 differentiate them from global data objects. The returned
12472 address is the PIC reg + an unspec constant.
12474 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12475 reg also appears in the address. */
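/* As a rough illustration of the two cases on ELF targets (assembly
   forms are illustrative; sym1 is assumed global, sym2 local):

     case 1:  movl sym1@GOT(%ebx), %eax         32-bit
              movq sym1@GOTPCREL(%rip), %rax    64-bit small PIC model
     case 2:  leal sym2@GOTOFF(%ebx), %eax      PIC reg + unspec offset  */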
12477 static rtx
12478 legitimize_pic_address (rtx orig, rtx reg)
12480 rtx addr = orig;
12481 rtx new_rtx = orig;
12483 #if TARGET_MACHO
12484 if (TARGET_MACHO && !TARGET_64BIT)
12486 if (reg == 0)
12487 reg = gen_reg_rtx (Pmode);
12488 /* Use the generic Mach-O PIC machinery. */
12489 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12491 #endif
12493 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12494 new_rtx = addr;
12495 else if (TARGET_64BIT
12496 && ix86_cmodel != CM_SMALL_PIC
12497 && gotoff_operand (addr, Pmode))
12499 rtx tmpreg;
12500 /* This symbol may be referenced via a displacement from the PIC
12501 base address (@GOTOFF). */
12503 if (reload_in_progress)
12504 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12505 if (GET_CODE (addr) == CONST)
12506 addr = XEXP (addr, 0);
12507 if (GET_CODE (addr) == PLUS)
12509 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12510 UNSPEC_GOTOFF);
12511 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12513 else
12514 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12515 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12516 if (!reg)
12517 tmpreg = gen_reg_rtx (Pmode);
12518 else
12519 tmpreg = reg;
12520 emit_move_insn (tmpreg, new_rtx);
12522 if (reg != 0)
12524 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12525 tmpreg, 1, OPTAB_DIRECT);
12526 new_rtx = reg;
12528 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12530 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12532 /* This symbol may be referenced via a displacement from the PIC
12533 base address (@GOTOFF). */
12535 if (reload_in_progress)
12536 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12537 if (GET_CODE (addr) == CONST)
12538 addr = XEXP (addr, 0);
12539 if (GET_CODE (addr) == PLUS)
12541 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12542 UNSPEC_GOTOFF);
12543 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12545 else
12546 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12547 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12548 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12550 if (reg != 0)
12552 emit_move_insn (reg, new_rtx);
12553 new_rtx = reg;
12556 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12557 /* We can't use @GOTOFF for text labels on VxWorks;
12558 see gotoff_operand. */
12559 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12561 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12563 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12564 return legitimize_dllimport_symbol (addr, true);
12565 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12566 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12567 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12569 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12570 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12574 /* For x64 PE-COFF there is no GOT table. So we use address
12575 directly. */
12576 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12578 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12579 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12581 if (reg == 0)
12582 reg = gen_reg_rtx (Pmode);
12583 emit_move_insn (reg, new_rtx);
12584 new_rtx = reg;
12586 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12588 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12589 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12590 new_rtx = gen_const_mem (Pmode, new_rtx);
12591 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12593 if (reg == 0)
12594 reg = gen_reg_rtx (Pmode);
12595 /* Use gen_movsi directly, otherwise the address is loaded
12596 into a register for CSE. We don't want to CSE these addresses;
12597 instead we CSE the addresses loaded from the GOT table, so skip this. */
12598 emit_insn (gen_movsi (reg, new_rtx));
12599 new_rtx = reg;
12601 else
12603 /* This symbol must be referenced via a load from the
12604 Global Offset Table (@GOT). */
12606 if (reload_in_progress)
12607 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12608 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12609 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12610 if (TARGET_64BIT)
12611 new_rtx = force_reg (Pmode, new_rtx);
12612 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12613 new_rtx = gen_const_mem (Pmode, new_rtx);
12614 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12616 if (reg == 0)
12617 reg = gen_reg_rtx (Pmode);
12618 emit_move_insn (reg, new_rtx);
12619 new_rtx = reg;
12622 else
12624 if (CONST_INT_P (addr)
12625 && !x86_64_immediate_operand (addr, VOIDmode))
12627 if (reg)
12629 emit_move_insn (reg, addr);
12630 new_rtx = reg;
12632 else
12633 new_rtx = force_reg (Pmode, addr);
12635 else if (GET_CODE (addr) == CONST)
12637 addr = XEXP (addr, 0);
12639 /* We must match stuff we generate before. Assume the only
12640 unspecs that can get here are ours. Not that we could do
12641 anything with them anyway.... */
12642 if (GET_CODE (addr) == UNSPEC
12643 || (GET_CODE (addr) == PLUS
12644 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12645 return orig;
12646 gcc_assert (GET_CODE (addr) == PLUS);
12648 if (GET_CODE (addr) == PLUS)
12650 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12652 /* Check first to see if this is a constant offset from a @GOTOFF
12653 symbol reference. */
12654 if (gotoff_operand (op0, Pmode)
12655 && CONST_INT_P (op1))
12657 if (!TARGET_64BIT)
12659 if (reload_in_progress)
12660 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12661 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12662 UNSPEC_GOTOFF);
12663 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12664 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12665 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12667 if (reg != 0)
12669 emit_move_insn (reg, new_rtx);
12670 new_rtx = reg;
12673 else
12675 if (INTVAL (op1) < -16*1024*1024
12676 || INTVAL (op1) >= 16*1024*1024)
12678 if (!x86_64_immediate_operand (op1, Pmode))
12679 op1 = force_reg (Pmode, op1);
12680 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12684 else
12686 rtx base = legitimize_pic_address (op0, reg);
12687 enum machine_mode mode = GET_MODE (base);
12688 new_rtx
12689 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12691 if (CONST_INT_P (new_rtx))
12693 if (INTVAL (new_rtx) < -16*1024*1024
12694 || INTVAL (new_rtx) >= 16*1024*1024)
12696 if (!x86_64_immediate_operand (new_rtx, mode))
12697 new_rtx = force_reg (mode, new_rtx);
12698 new_rtx
12699 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12701 else
12702 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12704 else
12706 if (GET_CODE (new_rtx) == PLUS
12707 && CONSTANT_P (XEXP (new_rtx, 1)))
12709 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12710 new_rtx = XEXP (new_rtx, 1);
12712 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12717 return new_rtx;
12720 /* Load the thread pointer. If TO_REG is true, force it into a register. */
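/* A minimal sketch of what UNSPEC_TP stands for, assuming a GNU/Linux-style
   target: the TLS segment base, i.e. %fs:0 in 64-bit mode and %gs:0 in
   32-bit mode.  The zero-extension below covers the x32 case, where the
   UNSPEC is created in SImode ptr_mode but a DImode value is wanted.  */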
12722 static rtx
12723 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12725 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12727 if (GET_MODE (tp) != tp_mode)
12729 gcc_assert (GET_MODE (tp) == SImode);
12730 gcc_assert (tp_mode == DImode);
12732 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12735 if (to_reg)
12736 tp = copy_to_mode_reg (tp_mode, tp);
12738 return tp;
12741 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12743 static GTY(()) rtx ix86_tls_symbol;
12745 static rtx
12746 ix86_tls_get_addr (void)
12748 if (!ix86_tls_symbol)
12750 const char *sym
12751 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12752 ? "___tls_get_addr" : "__tls_get_addr");
12754 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12757 return ix86_tls_symbol;
12760 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12762 static GTY(()) rtx ix86_tls_module_base_symbol;
12765 ix86_tls_module_base (void)
12767 if (!ix86_tls_module_base_symbol)
12769 ix86_tls_module_base_symbol
12770 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12772 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12773 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12776 return ix86_tls_module_base_symbol;
12779 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12780 false if we expect this to be used for a memory address and true if
12781 we expect to load the address into a register. */
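/* Rough sketch of the canonical x86-64 access sequences these models map to
   (per the ELF TLS ABI; the RTL built below varies with the model, -fpic and
   TARGET_GNU2_TLS, so this is illustrative only):
     global-dynamic:  leaq x@tlsgd(%rip), %rdi ; call __tls_get_addr@PLT
     initial-exec:    movq %fs:0, %rax ; addq x@gottpoff(%rip), %rax
     local-exec:      movq %fs:0, %rax ; leaq x@tpoff(%rax), %rax  */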
12783 static rtx
12784 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12786 rtx dest, base, off;
12787 rtx pic = NULL_RTX, tp = NULL_RTX;
12788 enum machine_mode tp_mode = Pmode;
12789 int type;
12791 switch (model)
12793 case TLS_MODEL_GLOBAL_DYNAMIC:
12794 dest = gen_reg_rtx (Pmode);
12796 if (!TARGET_64BIT)
12798 if (flag_pic)
12799 pic = pic_offset_table_rtx;
12800 else
12802 pic = gen_reg_rtx (Pmode);
12803 emit_insn (gen_set_got (pic));
12807 if (TARGET_GNU2_TLS)
12809 if (TARGET_64BIT)
12810 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12811 else
12812 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12814 tp = get_thread_pointer (Pmode, true);
12815 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12817 if (GET_MODE (x) != Pmode)
12818 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12820 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12822 else
12824 rtx caddr = ix86_tls_get_addr ();
12826 if (TARGET_64BIT)
12828 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12829 rtx insns;
12831 start_sequence ();
12832 emit_call_insn
12833 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12834 insns = get_insns ();
12835 end_sequence ();
12837 if (GET_MODE (x) != Pmode)
12838 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12840 RTL_CONST_CALL_P (insns) = 1;
12841 emit_libcall_block (insns, dest, rax, x);
12843 else
12844 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12846 break;
12848 case TLS_MODEL_LOCAL_DYNAMIC:
12849 base = gen_reg_rtx (Pmode);
12851 if (!TARGET_64BIT)
12853 if (flag_pic)
12854 pic = pic_offset_table_rtx;
12855 else
12857 pic = gen_reg_rtx (Pmode);
12858 emit_insn (gen_set_got (pic));
12862 if (TARGET_GNU2_TLS)
12864 rtx tmp = ix86_tls_module_base ();
12866 if (TARGET_64BIT)
12867 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12868 else
12869 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12871 tp = get_thread_pointer (Pmode, true);
12872 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12873 gen_rtx_MINUS (Pmode, tmp, tp));
12875 else
12877 rtx caddr = ix86_tls_get_addr ();
12879 if (TARGET_64BIT)
12881 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12882 rtx insns, eqv;
12884 start_sequence ();
12885 emit_call_insn
12886 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12887 insns = get_insns ();
12888 end_sequence ();
12890 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12891 share the LD_BASE result with other LD model accesses. */
12892 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12893 UNSPEC_TLS_LD_BASE);
12895 RTL_CONST_CALL_P (insns) = 1;
12896 emit_libcall_block (insns, base, rax, eqv);
12898 else
12899 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12902 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12903 off = gen_rtx_CONST (Pmode, off);
12905 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12907 if (TARGET_GNU2_TLS)
12909 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12911 if (GET_MODE (x) != Pmode)
12912 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12914 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12916 break;
12918 case TLS_MODEL_INITIAL_EXEC:
12919 if (TARGET_64BIT)
12921 if (TARGET_SUN_TLS && !TARGET_X32)
12923 /* The Sun linker took the AMD64 TLS spec literally
12924 and can only handle %rax as the destination of the
12925 initial-exec code sequence. */
12927 dest = gen_reg_rtx (DImode);
12928 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12929 return dest;
12932 /* Generate DImode references to avoid %fs:(%reg32)
12933 problems and the linker IE->LE relaxation bug. */
12934 tp_mode = DImode;
12935 pic = NULL;
12936 type = UNSPEC_GOTNTPOFF;
12938 else if (flag_pic)
12940 if (reload_in_progress)
12941 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12942 pic = pic_offset_table_rtx;
12943 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12945 else if (!TARGET_ANY_GNU_TLS)
12947 pic = gen_reg_rtx (Pmode);
12948 emit_insn (gen_set_got (pic));
12949 type = UNSPEC_GOTTPOFF;
12951 else
12953 pic = NULL;
12954 type = UNSPEC_INDNTPOFF;
12957 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12958 off = gen_rtx_CONST (tp_mode, off);
12959 if (pic)
12960 off = gen_rtx_PLUS (tp_mode, pic, off);
12961 off = gen_const_mem (tp_mode, off);
12962 set_mem_alias_set (off, ix86_GOT_alias_set ());
12964 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12966 base = get_thread_pointer (tp_mode,
12967 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12968 off = force_reg (tp_mode, off);
12969 return gen_rtx_PLUS (tp_mode, base, off);
12971 else
12973 base = get_thread_pointer (Pmode, true);
12974 dest = gen_reg_rtx (Pmode);
12975 emit_insn (ix86_gen_sub3 (dest, base, off));
12977 break;
12979 case TLS_MODEL_LOCAL_EXEC:
12980 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12981 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12982 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12983 off = gen_rtx_CONST (Pmode, off);
12985 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12987 base = get_thread_pointer (Pmode,
12988 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12989 return gen_rtx_PLUS (Pmode, base, off);
12991 else
12993 base = get_thread_pointer (Pmode, true);
12994 dest = gen_reg_rtx (Pmode);
12995 emit_insn (ix86_gen_sub3 (dest, base, off));
12997 break;
12999 default:
13000 gcc_unreachable ();
13003 return dest;
13006 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13007 to symbol DECL. */
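/* Example, stated as an assumption about the usual PE/COFF convention rather
   than about this exact code path: a reference to a dllimport'ed function foo
   is redirected through the import-table slot __imp__foo (or __imp_foo when
   user labels are not prefixed), so a call becomes an indirect call such as
   "call *__imp__foo" in AT&T syntax.  */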
13009 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13010 htab_t dllimport_map;
13012 static tree
13013 get_dllimport_decl (tree decl)
13015 struct tree_map *h, in;
13016 void **loc;
13017 const char *name;
13018 const char *prefix;
13019 size_t namelen, prefixlen;
13020 char *imp_name;
13021 tree to;
13022 rtx rtl;
13024 if (!dllimport_map)
13025 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13027 in.hash = htab_hash_pointer (decl);
13028 in.base.from = decl;
13029 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13030 h = (struct tree_map *) *loc;
13031 if (h)
13032 return h->to;
13034 *loc = h = ggc_alloc_tree_map ();
13035 h->hash = in.hash;
13036 h->base.from = decl;
13037 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13038 VAR_DECL, NULL, ptr_type_node);
13039 DECL_ARTIFICIAL (to) = 1;
13040 DECL_IGNORED_P (to) = 1;
13041 DECL_EXTERNAL (to) = 1;
13042 TREE_READONLY (to) = 1;
13044 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13045 name = targetm.strip_name_encoding (name);
13046 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13047 ? "*__imp_" : "*__imp__";
13048 namelen = strlen (name);
13049 prefixlen = strlen (prefix);
13050 imp_name = (char *) alloca (namelen + prefixlen + 1);
13051 memcpy (imp_name, prefix, prefixlen);
13052 memcpy (imp_name + prefixlen, name, namelen + 1);
13054 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13055 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13056 SET_SYMBOL_REF_DECL (rtl, to);
13057 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13059 rtl = gen_const_mem (Pmode, rtl);
13060 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13062 SET_DECL_RTL (to, rtl);
13063 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13065 return to;
13068 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13069 true if we require the result be a register. */
13071 static rtx
13072 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13074 tree imp_decl;
13075 rtx x;
13077 gcc_assert (SYMBOL_REF_DECL (symbol));
13078 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13080 x = DECL_RTL (imp_decl);
13081 if (want_reg)
13082 x = force_reg (Pmode, x);
13083 return x;
13086 /* Try machine-dependent ways of modifying an illegitimate address
13087 to be legitimate. If we find one, return the new, valid address.
13088 This macro is used in only one place: `memory_address' in explow.c.
13090 OLDX is the address as it was before break_out_memory_refs was called.
13091 In some cases it is useful to look at this to decide what needs to be done.
13093 It is always safe for this macro to do nothing. It exists to recognize
13094 opportunities to optimize the output.
13096 For the 80386, we handle X+REG by loading X into a register R and
13097 using R+REG. R will go in a general reg and indexing will be used.
13098 However, if REG is a broken-out memory address or multiplication,
13099 nothing needs to be done because REG can certainly go in a general reg.
13101 When -fpic is used, special handling is needed for symbolic references.
13102 See comments by legitimize_pic_address in i386.c for details. */
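/* Illustrative example of the canonicalization done below: an address like
   (plus (ashift (reg A) (const_int 2)) (reg B)) is rewritten as
   (plus (mult (reg A) (const_int 4)) (reg B)), which matches the hardware
   scaled-index form and can be emitted as (%B,%A,4).  This is a sketch;
   the exact rewrites depend on which sub-cases below apply.  */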
13104 static rtx
13105 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13106 enum machine_mode mode)
13108 int changed = 0;
13109 unsigned log;
13111 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13112 if (log)
13113 return legitimize_tls_address (x, (enum tls_model) log, false);
13114 if (GET_CODE (x) == CONST
13115 && GET_CODE (XEXP (x, 0)) == PLUS
13116 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13117 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13119 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13120 (enum tls_model) log, false);
13121 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13124 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13126 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13127 return legitimize_dllimport_symbol (x, true);
13128 if (GET_CODE (x) == CONST
13129 && GET_CODE (XEXP (x, 0)) == PLUS
13130 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13131 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13133 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13134 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13138 if (flag_pic && SYMBOLIC_CONST (x))
13139 return legitimize_pic_address (x, 0);
13141 #if TARGET_MACHO
13142 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13143 return machopic_indirect_data_reference (x, 0);
13144 #endif
13146 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13147 if (GET_CODE (x) == ASHIFT
13148 && CONST_INT_P (XEXP (x, 1))
13149 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13151 changed = 1;
13152 log = INTVAL (XEXP (x, 1));
13153 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13154 GEN_INT (1 << log));
13157 if (GET_CODE (x) == PLUS)
13159 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13161 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13162 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13163 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13165 changed = 1;
13166 log = INTVAL (XEXP (XEXP (x, 0), 1));
13167 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13168 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13169 GEN_INT (1 << log));
13172 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13173 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13174 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13176 changed = 1;
13177 log = INTVAL (XEXP (XEXP (x, 1), 1));
13178 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13179 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13180 GEN_INT (1 << log));
13183 /* Put multiply first if it isn't already. */
13184 if (GET_CODE (XEXP (x, 1)) == MULT)
13186 rtx tmp = XEXP (x, 0);
13187 XEXP (x, 0) = XEXP (x, 1);
13188 XEXP (x, 1) = tmp;
13189 changed = 1;
13192 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13193 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13194 created by virtual register instantiation, register elimination, and
13195 similar optimizations. */
13196 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13198 changed = 1;
13199 x = gen_rtx_PLUS (Pmode,
13200 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13201 XEXP (XEXP (x, 1), 0)),
13202 XEXP (XEXP (x, 1), 1));
13205 /* Canonicalize
13206 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13207 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13208 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13209 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13210 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13211 && CONSTANT_P (XEXP (x, 1)))
13213 rtx constant;
13214 rtx other = NULL_RTX;
13216 if (CONST_INT_P (XEXP (x, 1)))
13218 constant = XEXP (x, 1);
13219 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13221 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13223 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13224 other = XEXP (x, 1);
13226 else
13227 constant = 0;
13229 if (constant)
13231 changed = 1;
13232 x = gen_rtx_PLUS (Pmode,
13233 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13234 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13235 plus_constant (Pmode, other,
13236 INTVAL (constant)));
13240 if (changed && ix86_legitimate_address_p (mode, x, false))
13241 return x;
13243 if (GET_CODE (XEXP (x, 0)) == MULT)
13245 changed = 1;
13246 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13249 if (GET_CODE (XEXP (x, 1)) == MULT)
13251 changed = 1;
13252 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13255 if (changed
13256 && REG_P (XEXP (x, 1))
13257 && REG_P (XEXP (x, 0)))
13258 return x;
13260 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13262 changed = 1;
13263 x = legitimize_pic_address (x, 0);
13266 if (changed && ix86_legitimate_address_p (mode, x, false))
13267 return x;
13269 if (REG_P (XEXP (x, 0)))
13271 rtx temp = gen_reg_rtx (Pmode);
13272 rtx val = force_operand (XEXP (x, 1), temp);
13273 if (val != temp)
13275 val = convert_to_mode (Pmode, val, 1);
13276 emit_move_insn (temp, val);
13279 XEXP (x, 1) = temp;
13280 return x;
13283 else if (REG_P (XEXP (x, 1)))
13285 rtx temp = gen_reg_rtx (Pmode);
13286 rtx val = force_operand (XEXP (x, 0), temp);
13287 if (val != temp)
13289 val = convert_to_mode (Pmode, val, 1);
13290 emit_move_insn (temp, val);
13293 XEXP (x, 0) = temp;
13294 return x;
13298 return x;
13301 /* Print an integer constant expression in assembler syntax. Addition
13302 and subtraction are the only arithmetic that may appear in these
13303 expressions. FILE is the stdio stream to write to, X is the rtx, and
13304 CODE is the operand print code from the output string. */
13306 static void
13307 output_pic_addr_const (FILE *file, rtx x, int code)
13309 char buf[256];
13311 switch (GET_CODE (x))
13313 case PC:
13314 gcc_assert (flag_pic);
13315 putc ('.', file);
13316 break;
13318 case SYMBOL_REF:
13319 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13320 output_addr_const (file, x);
13321 else
13323 const char *name = XSTR (x, 0);
13325 /* Mark the decl as referenced so that cgraph will
13326 output the function. */
13327 if (SYMBOL_REF_DECL (x))
13328 mark_decl_referenced (SYMBOL_REF_DECL (x));
13330 #if TARGET_MACHO
13331 if (MACHOPIC_INDIRECT
13332 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13333 name = machopic_indirection_name (x, /*stub_p=*/true);
13334 #endif
13335 assemble_name (file, name);
13337 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13338 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13339 fputs ("@PLT", file);
13340 break;
13342 case LABEL_REF:
13343 x = XEXP (x, 0);
13344 /* FALLTHRU */
13345 case CODE_LABEL:
13346 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13347 assemble_name (asm_out_file, buf);
13348 break;
13350 case CONST_INT:
13351 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13352 break;
13354 case CONST:
13355 /* This used to output parentheses around the expression,
13356 but that does not work on the 386 (either ATT or BSD assembler). */
13357 output_pic_addr_const (file, XEXP (x, 0), code);
13358 break;
13360 case CONST_DOUBLE:
13361 if (GET_MODE (x) == VOIDmode)
13363 /* We can use %d if the number is <32 bits and positive. */
13364 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13365 fprintf (file, "0x%lx%08lx",
13366 (unsigned long) CONST_DOUBLE_HIGH (x),
13367 (unsigned long) CONST_DOUBLE_LOW (x));
13368 else
13369 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13371 else
13372 /* We can't handle floating point constants;
13373 TARGET_PRINT_OPERAND must handle them. */
13374 output_operand_lossage ("floating constant misused");
13375 break;
13377 case PLUS:
13378 /* Some assemblers need integer constants to appear first. */
13379 if (CONST_INT_P (XEXP (x, 0)))
13381 output_pic_addr_const (file, XEXP (x, 0), code);
13382 putc ('+', file);
13383 output_pic_addr_const (file, XEXP (x, 1), code);
13385 else
13387 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13388 output_pic_addr_const (file, XEXP (x, 1), code);
13389 putc ('+', file);
13390 output_pic_addr_const (file, XEXP (x, 0), code);
13392 break;
13394 case MINUS:
13395 if (!TARGET_MACHO)
13396 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13397 output_pic_addr_const (file, XEXP (x, 0), code);
13398 putc ('-', file);
13399 output_pic_addr_const (file, XEXP (x, 1), code);
13400 if (!TARGET_MACHO)
13401 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13402 break;
13404 case UNSPEC:
13405 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13407 bool f = i386_asm_output_addr_const_extra (file, x);
13408 gcc_assert (f);
13409 break;
13412 gcc_assert (XVECLEN (x, 0) == 1);
13413 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13414 switch (XINT (x, 1))
13416 case UNSPEC_GOT:
13417 fputs ("@GOT", file);
13418 break;
13419 case UNSPEC_GOTOFF:
13420 fputs ("@GOTOFF", file);
13421 break;
13422 case UNSPEC_PLTOFF:
13423 fputs ("@PLTOFF", file);
13424 break;
13425 case UNSPEC_PCREL:
13426 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13427 "(%rip)" : "[rip]", file);
13428 break;
13429 case UNSPEC_GOTPCREL:
13430 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13431 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13432 break;
13433 case UNSPEC_GOTTPOFF:
13434 /* FIXME: This might be @TPOFF in Sun ld too. */
13435 fputs ("@gottpoff", file);
13436 break;
13437 case UNSPEC_TPOFF:
13438 fputs ("@tpoff", file);
13439 break;
13440 case UNSPEC_NTPOFF:
13441 if (TARGET_64BIT)
13442 fputs ("@tpoff", file);
13443 else
13444 fputs ("@ntpoff", file);
13445 break;
13446 case UNSPEC_DTPOFF:
13447 fputs ("@dtpoff", file);
13448 break;
13449 case UNSPEC_GOTNTPOFF:
13450 if (TARGET_64BIT)
13451 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13452 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13453 else
13454 fputs ("@gotntpoff", file);
13455 break;
13456 case UNSPEC_INDNTPOFF:
13457 fputs ("@indntpoff", file);
13458 break;
13459 #if TARGET_MACHO
13460 case UNSPEC_MACHOPIC_OFFSET:
13461 putc ('-', file);
13462 machopic_output_function_base_name (file);
13463 break;
13464 #endif
13465 default:
13466 output_operand_lossage ("invalid UNSPEC as operand");
13467 break;
13469 break;
13471 default:
13472 output_operand_lossage ("invalid expression as operand");
13476 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13477 We need to emit DTP-relative relocations. */
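/* For example (assuming ASM_LONG is the ".long" directive), a DTP-relative
   reference to symbol x is emitted as
       .long x@dtpoff
   for SIZE 4, and as
       .long x@dtpoff, 0
   for SIZE 8, padding the value to eight bytes.  */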
13479 static void ATTRIBUTE_UNUSED
13480 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13482 fputs (ASM_LONG, file);
13483 output_addr_const (file, x);
13484 fputs ("@dtpoff", file);
13485 switch (size)
13487 case 4:
13488 break;
13489 case 8:
13490 fputs (", 0", file);
13491 break;
13492 default:
13493 gcc_unreachable ();
13497 /* Return true if X is a representation of the PIC register. This copes
13498 with calls from ix86_find_base_term, where the register might have
13499 been replaced by a cselib value. */
13501 static bool
13502 ix86_pic_register_p (rtx x)
13504 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13505 return (pic_offset_table_rtx
13506 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13507 else
13508 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13511 /* Helper function for ix86_delegitimize_address.
13512 Attempt to delegitimize TLS local-exec accesses. */
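/* Sketch of what is being undone here: a local-exec access whose address was
   built as a %fs/%gs segment-relative displacement wrapping UNSPEC_NTPOFF,
   roughly %fs:x@tpoff(base,index,scale), is mapped back to the underlying
   SYMBOL_REF x (plus any base/index/offset) for debug output.  */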
13514 static rtx
13515 ix86_delegitimize_tls_address (rtx orig_x)
13517 rtx x = orig_x, unspec;
13518 struct ix86_address addr;
13520 if (!TARGET_TLS_DIRECT_SEG_REFS)
13521 return orig_x;
13522 if (MEM_P (x))
13523 x = XEXP (x, 0);
13524 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13525 return orig_x;
13526 if (ix86_decompose_address (x, &addr) == 0
13527 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13528 || addr.disp == NULL_RTX
13529 || GET_CODE (addr.disp) != CONST)
13530 return orig_x;
13531 unspec = XEXP (addr.disp, 0);
13532 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13533 unspec = XEXP (unspec, 0);
13534 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13535 return orig_x;
13536 x = XVECEXP (unspec, 0, 0);
13537 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13538 if (unspec != XEXP (addr.disp, 0))
13539 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13540 if (addr.index)
13542 rtx idx = addr.index;
13543 if (addr.scale != 1)
13544 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13545 x = gen_rtx_PLUS (Pmode, idx, x);
13547 if (addr.base)
13548 x = gen_rtx_PLUS (Pmode, addr.base, x);
13549 if (MEM_P (orig_x))
13550 x = replace_equiv_address_nv (orig_x, x);
13551 return x;
13554 /* In the name of slightly smaller debug output, and to cater to
13555 general assembler lossage, recognize PIC+GOTOFF and turn it back
13556 into a direct symbol reference.
13558 On Darwin, this is necessary to avoid a crash, because Darwin
13559 has a different PIC label for each routine but the DWARF debugging
13560 information is not associated with any particular routine, so it's
13561 necessary to remove references to the PIC label from RTL stored by
13562 the DWARF output code. */
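/* Illustrative mappings performed below (assuming a 32-bit GOT-based form):
     (plus pic_reg (const (unspec [sym] UNSPEC_GOTOFF)))      -> sym
     (mem (plus pic_reg (const (unspec [sym] UNSPEC_GOT))))   -> sym
   with any constant or register addend re-applied to the result.  */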
13564 static rtx
13565 ix86_delegitimize_address (rtx x)
13567 rtx orig_x = delegitimize_mem_from_attrs (x);
13568 /* addend is NULL or some rtx if x is something+GOTOFF where
13569 something doesn't include the PIC register. */
13570 rtx addend = NULL_RTX;
13571 /* reg_addend is NULL or a multiple of some register. */
13572 rtx reg_addend = NULL_RTX;
13573 /* const_addend is NULL or a const_int. */
13574 rtx const_addend = NULL_RTX;
13575 /* This is the result, or NULL. */
13576 rtx result = NULL_RTX;
13578 x = orig_x;
13580 if (MEM_P (x))
13581 x = XEXP (x, 0);
13583 if (TARGET_64BIT)
13585 if (GET_CODE (x) == CONST
13586 && GET_CODE (XEXP (x, 0)) == PLUS
13587 && GET_MODE (XEXP (x, 0)) == Pmode
13588 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13589 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13590 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13592 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13593 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13594 if (MEM_P (orig_x))
13595 x = replace_equiv_address_nv (orig_x, x);
13596 return x;
13598 if (GET_CODE (x) != CONST
13599 || GET_CODE (XEXP (x, 0)) != UNSPEC
13600 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13601 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13602 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13603 return ix86_delegitimize_tls_address (orig_x);
13604 x = XVECEXP (XEXP (x, 0), 0, 0);
13605 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13607 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13608 GET_MODE (x), 0);
13609 if (x == NULL_RTX)
13610 return orig_x;
13612 return x;
13615 if (GET_CODE (x) != PLUS
13616 || GET_CODE (XEXP (x, 1)) != CONST)
13617 return ix86_delegitimize_tls_address (orig_x);
13619 if (ix86_pic_register_p (XEXP (x, 0)))
13620 /* %ebx + GOT/GOTOFF */
13622 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13624 /* %ebx + %reg * scale + GOT/GOTOFF */
13625 reg_addend = XEXP (x, 0);
13626 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13627 reg_addend = XEXP (reg_addend, 1);
13628 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13629 reg_addend = XEXP (reg_addend, 0);
13630 else
13632 reg_addend = NULL_RTX;
13633 addend = XEXP (x, 0);
13636 else
13637 addend = XEXP (x, 0);
13639 x = XEXP (XEXP (x, 1), 0);
13640 if (GET_CODE (x) == PLUS
13641 && CONST_INT_P (XEXP (x, 1)))
13643 const_addend = XEXP (x, 1);
13644 x = XEXP (x, 0);
13647 if (GET_CODE (x) == UNSPEC
13648 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13649 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13650 result = XVECEXP (x, 0, 0);
13652 if (TARGET_MACHO && darwin_local_data_pic (x)
13653 && !MEM_P (orig_x))
13654 result = XVECEXP (x, 0, 0);
13656 if (! result)
13657 return ix86_delegitimize_tls_address (orig_x);
13659 if (const_addend)
13660 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13661 if (reg_addend)
13662 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13663 if (addend)
13665 /* If the rest of original X doesn't involve the PIC register, add
13666 addend and subtract pic_offset_table_rtx. This can happen e.g.
13667 for code like:
13668 leal (%ebx, %ecx, 4), %ecx
13670 movl foo@GOTOFF(%ecx), %edx
13671 in which case we return (%ecx - %ebx) + foo. */
13672 if (pic_offset_table_rtx)
13673 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13674 pic_offset_table_rtx),
13675 result);
13676 else
13677 return orig_x;
13679 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13681 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13682 if (result == NULL_RTX)
13683 return orig_x;
13685 return result;
13688 /* If X is a machine specific address (i.e. a symbol or label being
13689 referenced as a displacement from the GOT implemented using an
13690 UNSPEC), then return the base term. Otherwise return X. */
13693 ix86_find_base_term (rtx x)
13695 rtx term;
13697 if (TARGET_64BIT)
13699 if (GET_CODE (x) != CONST)
13700 return x;
13701 term = XEXP (x, 0);
13702 if (GET_CODE (term) == PLUS
13703 && (CONST_INT_P (XEXP (term, 1))
13704 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13705 term = XEXP (term, 0);
13706 if (GET_CODE (term) != UNSPEC
13707 || (XINT (term, 1) != UNSPEC_GOTPCREL
13708 && XINT (term, 1) != UNSPEC_PCREL))
13709 return x;
13711 return XVECEXP (term, 0, 0);
13714 return ix86_delegitimize_address (x);
13717 static void
13718 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13719 bool fp, FILE *file)
13721 const char *suffix;
13723 if (mode == CCFPmode || mode == CCFPUmode)
13725 code = ix86_fp_compare_code_to_integer (code);
13726 mode = CCmode;
13728 if (reverse)
13729 code = reverse_condition (code);
13731 switch (code)
13733 case EQ:
13734 switch (mode)
13736 case CCAmode:
13737 suffix = "a";
13738 break;
13740 case CCCmode:
13741 suffix = "c";
13742 break;
13744 case CCOmode:
13745 suffix = "o";
13746 break;
13748 case CCSmode:
13749 suffix = "s";
13750 break;
13752 default:
13753 suffix = "e";
13755 break;
13756 case NE:
13757 switch (mode)
13759 case CCAmode:
13760 suffix = "na";
13761 break;
13763 case CCCmode:
13764 suffix = "nc";
13765 break;
13767 case CCOmode:
13768 suffix = "no";
13769 break;
13771 case CCSmode:
13772 suffix = "ns";
13773 break;
13775 default:
13776 suffix = "ne";
13778 break;
13779 case GT:
13780 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13781 suffix = "g";
13782 break;
13783 case GTU:
13784 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13785 Those same assemblers have the same but opposite lossage on cmov. */
13786 if (mode == CCmode)
13787 suffix = fp ? "nbe" : "a";
13788 else if (mode == CCCmode)
13789 suffix = "b";
13790 else
13791 gcc_unreachable ();
13792 break;
13793 case LT:
13794 switch (mode)
13796 case CCNOmode:
13797 case CCGOCmode:
13798 suffix = "s";
13799 break;
13801 case CCmode:
13802 case CCGCmode:
13803 suffix = "l";
13804 break;
13806 default:
13807 gcc_unreachable ();
13809 break;
13810 case LTU:
13811 gcc_assert (mode == CCmode || mode == CCCmode);
13812 suffix = "b";
13813 break;
13814 case GE:
13815 switch (mode)
13817 case CCNOmode:
13818 case CCGOCmode:
13819 suffix = "ns";
13820 break;
13822 case CCmode:
13823 case CCGCmode:
13824 suffix = "ge";
13825 break;
13827 default:
13828 gcc_unreachable ();
13830 break;
13831 case GEU:
13832 /* ??? As above. */
13833 gcc_assert (mode == CCmode || mode == CCCmode);
13834 suffix = fp ? "nb" : "ae";
13835 break;
13836 case LE:
13837 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13838 suffix = "le";
13839 break;
13840 case LEU:
13841 /* ??? As above. */
13842 if (mode == CCmode)
13843 suffix = "be";
13844 else if (mode == CCCmode)
13845 suffix = fp ? "nb" : "ae";
13846 else
13847 gcc_unreachable ();
13848 break;
13849 case UNORDERED:
13850 suffix = fp ? "u" : "p";
13851 break;
13852 case ORDERED:
13853 suffix = fp ? "nu" : "np";
13854 break;
13855 default:
13856 gcc_unreachable ();
13858 fputs (suffix, file);
13861 /* Print the name of register X to FILE based on its machine mode and number.
13862 If CODE is 'w', pretend the mode is HImode.
13863 If CODE is 'b', pretend the mode is QImode.
13864 If CODE is 'k', pretend the mode is SImode.
13865 If CODE is 'q', pretend the mode is DImode.
13866 If CODE is 'x', pretend the mode is V4SFmode.
13867 If CODE is 't', pretend the mode is V8SFmode.
13868 If CODE is 'h', pretend the reg is the 'high' byte register.
13869 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13870 If CODE is 'd', duplicate the operand for AVX instruction.
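/* For illustration (AT&T syntax, hard register 0): code 'b' prints %al,
   'w' prints %ax, 'k' prints %eax, 'q' prints %rax and 'h' prints %ah;
   with no code the width is taken from the operand's own mode.  */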
13873 void
13874 print_reg (rtx x, int code, FILE *file)
13876 const char *reg;
13877 unsigned int regno;
13878 bool duplicated = code == 'd' && TARGET_AVX;
13880 if (ASSEMBLER_DIALECT == ASM_ATT)
13881 putc ('%', file);
13883 if (x == pc_rtx)
13885 gcc_assert (TARGET_64BIT);
13886 fputs ("rip", file);
13887 return;
13890 regno = true_regnum (x);
13891 gcc_assert (regno != ARG_POINTER_REGNUM
13892 && regno != FRAME_POINTER_REGNUM
13893 && regno != FLAGS_REG
13894 && regno != FPSR_REG
13895 && regno != FPCR_REG);
13897 if (code == 'w' || MMX_REG_P (x))
13898 code = 2;
13899 else if (code == 'b')
13900 code = 1;
13901 else if (code == 'k')
13902 code = 4;
13903 else if (code == 'q')
13904 code = 8;
13905 else if (code == 'y')
13906 code = 3;
13907 else if (code == 'h')
13908 code = 0;
13909 else if (code == 'x')
13910 code = 16;
13911 else if (code == 't')
13912 code = 32;
13913 else
13914 code = GET_MODE_SIZE (GET_MODE (x));
13916 /* Irritatingly, the AMD extended registers use a different naming
13917 convention from the normal registers: "r%d[bwd]". */
13918 if (REX_INT_REGNO_P (regno))
13920 gcc_assert (TARGET_64BIT);
13921 putc ('r', file);
13922 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13923 switch (code)
13925 case 0:
13926 error ("extended registers have no high halves");
13927 break;
13928 case 1:
13929 putc ('b', file);
13930 break;
13931 case 2:
13932 putc ('w', file);
13933 break;
13934 case 4:
13935 putc ('d', file);
13936 break;
13937 case 8:
13938 /* no suffix */
13939 break;
13940 default:
13941 error ("unsupported operand size for extended register");
13942 break;
13944 return;
13947 reg = NULL;
13948 switch (code)
13950 case 3:
13951 if (STACK_TOP_P (x))
13953 reg = "st(0)";
13954 break;
13956 /* FALLTHRU */
13957 case 8:
13958 case 4:
13959 case 12:
13960 if (! ANY_FP_REG_P (x))
13961 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13962 /* FALLTHRU */
13963 case 16:
13964 case 2:
13965 normal:
13966 reg = hi_reg_name[regno];
13967 break;
13968 case 1:
13969 if (regno >= ARRAY_SIZE (qi_reg_name))
13970 goto normal;
13971 reg = qi_reg_name[regno];
13972 break;
13973 case 0:
13974 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13975 goto normal;
13976 reg = qi_high_reg_name[regno];
13977 break;
13978 case 32:
13979 if (SSE_REG_P (x))
13981 gcc_assert (!duplicated);
13982 putc ('y', file);
13983 fputs (hi_reg_name[regno] + 1, file);
13984 return;
13986 break;
13987 default:
13988 gcc_unreachable ();
13991 fputs (reg, file);
13992 if (duplicated)
13994 if (ASSEMBLER_DIALECT == ASM_ATT)
13995 fprintf (file, ", %%%s", reg);
13996 else
13997 fprintf (file, ", %s", reg);
14001 /* Locate some local-dynamic symbol still in use by this function
14002 so that we can print its name in some tls_local_dynamic_base
14003 pattern. */
14005 static int
14006 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14008 rtx x = *px;
14010 if (GET_CODE (x) == SYMBOL_REF
14011 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14013 cfun->machine->some_ld_name = XSTR (x, 0);
14014 return 1;
14017 return 0;
14020 static const char *
14021 get_some_local_dynamic_name (void)
14023 rtx insn;
14025 if (cfun->machine->some_ld_name)
14026 return cfun->machine->some_ld_name;
14028 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14029 if (NONDEBUG_INSN_P (insn)
14030 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14031 return cfun->machine->some_ld_name;
14033 return NULL;
14036 /* Meaning of CODE:
14037 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14038 C -- print opcode suffix for set/cmov insn.
14039 c -- like C, but print reversed condition
14040 F,f -- likewise, but for floating-point.
14041 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14042 otherwise nothing
14043 R -- print the prefix for register names.
14044 z -- print the opcode suffix for the size of the current operand.
14045 Z -- likewise, with special suffixes for x87 instructions.
14046 * -- print a star (in certain assembler syntax)
14047 A -- print an absolute memory reference.
14048 E -- print address with DImode register names if TARGET_64BIT.
14049 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14050 s -- print a shift double count, followed by the assembler's argument
14051 delimiter.
14052 b -- print the QImode name of the register for the indicated operand.
14053 %b0 would print %al if operands[0] is reg 0.
14054 w -- likewise, print the HImode name of the register.
14055 k -- likewise, print the SImode name of the register.
14056 q -- likewise, print the DImode name of the register.
14057 x -- likewise, print the V4SFmode name of the register.
14058 t -- likewise, print the V8SFmode name of the register.
14059 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14060 y -- print "st(0)" instead of "st" as a register.
14061 d -- print duplicated register operand for AVX instruction.
14062 D -- print condition for SSE cmp instruction.
14063 P -- if PIC, print an @PLT suffix.
14064 p -- print raw symbol name.
14065 X -- don't print any sort of PIC '@' suffix for a symbol.
14066 & -- print some in-use local-dynamic symbol name.
14067 H -- print a memory address offset by 8; used for sse high-parts
14068 Y -- print condition for XOP pcom* instruction.
14069 + -- print a branch hint as 'cs' or 'ds' prefix
14070 ; -- print a semicolon (after prefixes due to bug in older gas).
14071 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14072 @ -- print a segment register of thread base pointer load
14073 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
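/* Hypothetical usage example (not a real i386.md template): in an output
   template such as "mov%z0\t{%1, %0|%0, %1}", %z0 expands to the b/w/l/q
   suffix implied by the mode of operand 0, and the {att|intel} braces pick
   the spelling for the current assembler dialect.  */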
14076 void
14077 ix86_print_operand (FILE *file, rtx x, int code)
14079 if (code)
14081 switch (code)
14083 case 'A':
14084 switch (ASSEMBLER_DIALECT)
14086 case ASM_ATT:
14087 putc ('*', file);
14088 break;
14090 case ASM_INTEL:
14091 /* Intel syntax. For absolute addresses, registers should not
14092 be surrounded by brackets. */
14093 if (!REG_P (x))
14095 putc ('[', file);
14096 ix86_print_operand (file, x, 0);
14097 putc (']', file);
14098 return;
14100 break;
14102 default:
14103 gcc_unreachable ();
14106 ix86_print_operand (file, x, 0);
14107 return;
14109 case 'E':
14110 /* Wrap address in an UNSPEC to declare special handling. */
14111 if (TARGET_64BIT)
14112 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14114 output_address (x);
14115 return;
14117 case 'L':
14118 if (ASSEMBLER_DIALECT == ASM_ATT)
14119 putc ('l', file);
14120 return;
14122 case 'W':
14123 if (ASSEMBLER_DIALECT == ASM_ATT)
14124 putc ('w', file);
14125 return;
14127 case 'B':
14128 if (ASSEMBLER_DIALECT == ASM_ATT)
14129 putc ('b', file);
14130 return;
14132 case 'Q':
14133 if (ASSEMBLER_DIALECT == ASM_ATT)
14134 putc ('l', file);
14135 return;
14137 case 'S':
14138 if (ASSEMBLER_DIALECT == ASM_ATT)
14139 putc ('s', file);
14140 return;
14142 case 'T':
14143 if (ASSEMBLER_DIALECT == ASM_ATT)
14144 putc ('t', file);
14145 return;
14147 case 'O':
14148 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14149 if (ASSEMBLER_DIALECT != ASM_ATT)
14150 return;
14152 switch (GET_MODE_SIZE (GET_MODE (x)))
14154 case 2:
14155 putc ('w', file);
14156 break;
14158 case 4:
14159 putc ('l', file);
14160 break;
14162 case 8:
14163 putc ('q', file);
14164 break;
14166 default:
14167 output_operand_lossage
14168 ("invalid operand size for operand code 'O'");
14169 return;
14172 putc ('.', file);
14173 #endif
14174 return;
14176 case 'z':
14177 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14179 /* Opcodes don't get size suffixes when using Intel syntax. */
14180 if (ASSEMBLER_DIALECT == ASM_INTEL)
14181 return;
14183 switch (GET_MODE_SIZE (GET_MODE (x)))
14185 case 1:
14186 putc ('b', file);
14187 return;
14189 case 2:
14190 putc ('w', file);
14191 return;
14193 case 4:
14194 putc ('l', file);
14195 return;
14197 case 8:
14198 putc ('q', file);
14199 return;
14201 default:
14202 output_operand_lossage
14203 ("invalid operand size for operand code 'z'");
14204 return;
14208 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14209 warning
14210 (0, "non-integer operand used with operand code 'z'");
14211 /* FALLTHRU */
14213 case 'Z':
14214 /* 387 opcodes don't get size suffixes when using Intel syntax. */
14215 if (ASSEMBLER_DIALECT == ASM_INTEL)
14216 return;
14218 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14220 switch (GET_MODE_SIZE (GET_MODE (x)))
14222 case 2:
14223 #ifdef HAVE_AS_IX86_FILDS
14224 putc ('s', file);
14225 #endif
14226 return;
14228 case 4:
14229 putc ('l', file);
14230 return;
14232 case 8:
14233 #ifdef HAVE_AS_IX86_FILDQ
14234 putc ('q', file);
14235 #else
14236 fputs ("ll", file);
14237 #endif
14238 return;
14240 default:
14241 break;
14244 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14246 /* 387 opcodes don't get size suffixes
14247 if the operands are registers. */
14248 if (STACK_REG_P (x))
14249 return;
14251 switch (GET_MODE_SIZE (GET_MODE (x)))
14253 case 4:
14254 putc ('s', file);
14255 return;
14257 case 8:
14258 putc ('l', file);
14259 return;
14261 case 12:
14262 case 16:
14263 putc ('t', file);
14264 return;
14266 default:
14267 break;
14270 else
14272 output_operand_lossage
14273 ("invalid operand type used with operand code 'Z'");
14274 return;
14277 output_operand_lossage
14278 ("invalid operand size for operand code 'Z'");
14279 return;
14281 case 'd':
14282 case 'b':
14283 case 'w':
14284 case 'k':
14285 case 'q':
14286 case 'h':
14287 case 't':
14288 case 'y':
14289 case 'x':
14290 case 'X':
14291 case 'P':
14292 case 'p':
14293 break;
14295 case 's':
14296 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14298 ix86_print_operand (file, x, 0);
14299 fputs (", ", file);
14301 return;
14303 case 'Y':
14304 switch (GET_CODE (x))
14306 case NE:
14307 fputs ("neq", file);
14308 break;
14309 case EQ:
14310 fputs ("eq", file);
14311 break;
14312 case GE:
14313 case GEU:
14314 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14315 break;
14316 case GT:
14317 case GTU:
14318 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14319 break;
14320 case LE:
14321 case LEU:
14322 fputs ("le", file);
14323 break;
14324 case LT:
14325 case LTU:
14326 fputs ("lt", file);
14327 break;
14328 case UNORDERED:
14329 fputs ("unord", file);
14330 break;
14331 case ORDERED:
14332 fputs ("ord", file);
14333 break;
14334 case UNEQ:
14335 fputs ("ueq", file);
14336 break;
14337 case UNGE:
14338 fputs ("nlt", file);
14339 break;
14340 case UNGT:
14341 fputs ("nle", file);
14342 break;
14343 case UNLE:
14344 fputs ("ule", file);
14345 break;
14346 case UNLT:
14347 fputs ("ult", file);
14348 break;
14349 case LTGT:
14350 fputs ("une", file);
14351 break;
14352 default:
14353 output_operand_lossage ("operand is not a condition code, "
14354 "invalid operand code 'Y'");
14355 return;
14357 return;
14359 case 'D':
14360 /* A little bit of braindamage here. The SSE compare instructions
14361 use completely different names for the comparisons than the
14362 fp conditional moves do. */
14363 switch (GET_CODE (x))
14365 case UNEQ:
14366 if (TARGET_AVX)
14368 fputs ("eq_us", file);
14369 break;
14371 case EQ:
14372 fputs ("eq", file);
14373 break;
14374 case UNLT:
14375 if (TARGET_AVX)
14377 fputs ("nge", file);
14378 break;
14380 case LT:
14381 fputs ("lt", file);
14382 break;
14383 case UNLE:
14384 if (TARGET_AVX)
14386 fputs ("ngt", file);
14387 break;
14389 case LE:
14390 fputs ("le", file);
14391 break;
14392 case UNORDERED:
14393 fputs ("unord", file);
14394 break;
14395 case LTGT:
14396 if (TARGET_AVX)
14398 fputs ("neq_oq", file);
14399 break;
14401 case NE:
14402 fputs ("neq", file);
14403 break;
14404 case GE:
14405 if (TARGET_AVX)
14407 fputs ("ge", file);
14408 break;
14410 case UNGE:
14411 fputs ("nlt", file);
14412 break;
14413 case GT:
14414 if (TARGET_AVX)
14416 fputs ("gt", file);
14417 break;
14419 case UNGT:
14420 fputs ("nle", file);
14421 break;
14422 case ORDERED:
14423 fputs ("ord", file);
14424 break;
14425 default:
14426 output_operand_lossage ("operand is not a condition code, "
14427 "invalid operand code 'D'");
14428 return;
14430 return;
14432 case 'F':
14433 case 'f':
14434 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14435 if (ASSEMBLER_DIALECT == ASM_ATT)
14436 putc ('.', file);
14437 #endif
14439 case 'C':
14440 case 'c':
14441 if (!COMPARISON_P (x))
14443 output_operand_lossage ("operand is not a condition code, "
14444 "invalid operand code '%c'", code);
14445 return;
14447 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14448 code == 'c' || code == 'f',
14449 code == 'F' || code == 'f',
14450 file);
14451 return;
14453 case 'H':
14454 if (!offsettable_memref_p (x))
14456 output_operand_lossage ("operand is not an offsettable memory "
14457 "reference, invalid operand code 'H'");
14458 return;
14460 /* It doesn't actually matter what mode we use here, as we're
14461 only going to use this for printing. */
14462 x = adjust_address_nv (x, DImode, 8);
14463 break;
14465 case 'K':
14466 gcc_assert (CONST_INT_P (x));
14468 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14469 #ifdef HAVE_AS_IX86_HLE
14470 fputs ("xacquire ", file);
14471 #else
14472 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14473 #endif
14474 else if (INTVAL (x) & IX86_HLE_RELEASE)
14475 #ifdef HAVE_AS_IX86_HLE
14476 fputs ("xrelease ", file);
14477 #else
14478 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14479 #endif
14480 /* We do not want to print value of the operand. */
14481 return;
14483 case '*':
14484 if (ASSEMBLER_DIALECT == ASM_ATT)
14485 putc ('*', file);
14486 return;
14488 case '&':
14490 const char *name = get_some_local_dynamic_name ();
14491 if (name == NULL)
14492 output_operand_lossage ("'%%&' used without any "
14493 "local dynamic TLS references");
14494 else
14495 assemble_name (file, name);
14496 return;
14499 case '+':
14501 rtx x;
14503 if (!optimize
14504 || optimize_function_for_size_p (cfun)
14505 || !TARGET_BRANCH_PREDICTION_HINTS)
14506 return;
14508 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14509 if (x)
14511 int pred_val = INTVAL (XEXP (x, 0));
14513 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14514 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14516 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14517 bool cputaken
14518 = final_forward_branch_p (current_output_insn) == 0;
14520 /* Emit hints only in the cases where the default branch
14521 prediction heuristics would fail. */
14522 if (taken != cputaken)
14524 /* We use the 3e (DS) prefix for taken branches and
14525 the 2e (CS) prefix for not-taken branches. */
14526 if (taken)
14527 fputs ("ds ; ", file);
14528 else
14529 fputs ("cs ; ", file);
14533 return;
14536 case ';':
14537 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14538 putc (';', file);
14539 #endif
14540 return;
14542 case '@':
14543 if (ASSEMBLER_DIALECT == ASM_ATT)
14544 putc ('%', file);
14546 /* The kernel uses a different segment register for performance
14547 reasons; this way a system call does not have to trash the
14548 userspace segment register, which would be expensive. */
14549 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14550 fputs ("fs", file);
14551 else
14552 fputs ("gs", file);
14553 return;
14555 case '~':
14556 putc (TARGET_AVX2 ? 'i' : 'f', file);
14557 return;
14559 case '^':
14560 if (TARGET_64BIT && Pmode != word_mode)
14561 fputs ("addr32 ", file);
14562 return;
14564 default:
14565 output_operand_lossage ("invalid operand code '%c'", code);
14569 if (REG_P (x))
14570 print_reg (x, code, file);
14572 else if (MEM_P (x))
14574 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14575 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14576 && GET_MODE (x) != BLKmode)
14578 const char * size;
14579 switch (GET_MODE_SIZE (GET_MODE (x)))
14581 case 1: size = "BYTE"; break;
14582 case 2: size = "WORD"; break;
14583 case 4: size = "DWORD"; break;
14584 case 8: size = "QWORD"; break;
14585 case 12: size = "TBYTE"; break;
14586 case 16:
14587 if (GET_MODE (x) == XFmode)
14588 size = "TBYTE";
14589 else
14590 size = "XMMWORD";
14591 break;
14592 case 32: size = "YMMWORD"; break;
14593 default:
14594 gcc_unreachable ();
14597 /* Check for explicit size override (codes 'b', 'w', 'k',
14598 'q' and 'x') */
14599 if (code == 'b')
14600 size = "BYTE";
14601 else if (code == 'w')
14602 size = "WORD";
14603 else if (code == 'k')
14604 size = "DWORD";
14605 else if (code == 'q')
14606 size = "QWORD";
14607 else if (code == 'x')
14608 size = "XMMWORD";
14610 fputs (size, file);
14611 fputs (" PTR ", file);
14614 x = XEXP (x, 0);
14615 /* Avoid (%rip) for call operands. */
14616 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14617 && !CONST_INT_P (x))
14618 output_addr_const (file, x);
14619 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14620 output_operand_lossage ("invalid constraints for operand");
14621 else
14622 output_address (x);
14625 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14627 REAL_VALUE_TYPE r;
14628 long l;
14630 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14631 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14633 if (ASSEMBLER_DIALECT == ASM_ATT)
14634 putc ('$', file);
14635 /* Sign-extend the 32-bit SFmode immediate to 8 bytes. */
14636 if (code == 'q')
14637 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14638 else
14639 fprintf (file, "0x%08x", (unsigned int) l);
14642 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14644 REAL_VALUE_TYPE r;
14645 long l[2];
14647 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14648 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14650 if (ASSEMBLER_DIALECT == ASM_ATT)
14651 putc ('$', file);
14652 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14655 /* These float cases don't actually occur as immediate operands. */
14656 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14658 char dstr[30];
14660 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14661 fputs (dstr, file);
14664 else
14666 /* We have patterns that allow zero sets of memory, for instance.
14667 In 64-bit mode, we should probably support all 8-byte vectors,
14668 since we can in fact encode that into an immediate. */
14669 if (GET_CODE (x) == CONST_VECTOR)
14671 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14672 x = const0_rtx;
14675 if (code != 'P' && code != 'p')
14677 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14679 if (ASSEMBLER_DIALECT == ASM_ATT)
14680 putc ('$', file);
14682 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14683 || GET_CODE (x) == LABEL_REF)
14685 if (ASSEMBLER_DIALECT == ASM_ATT)
14686 putc ('$', file);
14687 else
14688 fputs ("OFFSET FLAT:", file);
14691 if (CONST_INT_P (x))
14692 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14693 else if (flag_pic || MACHOPIC_INDIRECT)
14694 output_pic_addr_const (file, x, code);
14695 else
14696 output_addr_const (file, x);
14700 static bool
14701 ix86_print_operand_punct_valid_p (unsigned char code)
14703 return (code == '@' || code == '*' || code == '+' || code == '&'
14704 || code == ';' || code == '~' || code == '^');
14707 /* Print a memory operand whose address is ADDR. */
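/* For example (roughly, following the two branches below): an address with
   displacement foo+4, base %ebx, index %ecx and scale 8 prints as
   "foo+4(%ebx,%ecx,8)" in AT&T syntax and as "foo[ebx+4+ecx*8]" in Intel
   syntax, with any fs:/gs: segment override emitted first.  */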
14709 static void
14710 ix86_print_operand_address (FILE *file, rtx addr)
14712 struct ix86_address parts;
14713 rtx base, index, disp;
14714 int scale;
14715 int ok;
14716 bool vsib = false;
14717 int code = 0;
14719 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14721 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14722 gcc_assert (parts.index == NULL_RTX);
14723 parts.index = XVECEXP (addr, 0, 1);
14724 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14725 addr = XVECEXP (addr, 0, 0);
14726 vsib = true;
14728 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14730 gcc_assert (TARGET_64BIT);
14731 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14732 code = 'q';
14734 else
14735 ok = ix86_decompose_address (addr, &parts);
14737 gcc_assert (ok);
14739 base = parts.base;
14740 index = parts.index;
14741 disp = parts.disp;
14742 scale = parts.scale;
14744 switch (parts.seg)
14746 case SEG_DEFAULT:
14747 break;
14748 case SEG_FS:
14749 case SEG_GS:
14750 if (ASSEMBLER_DIALECT == ASM_ATT)
14751 putc ('%', file);
14752 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14753 break;
14754 default:
14755 gcc_unreachable ();
14758 /* Use the one-byte-shorter RIP-relative addressing for 64-bit mode. */
14759 if (TARGET_64BIT && !base && !index)
14761 rtx symbol = disp;
14763 if (GET_CODE (disp) == CONST
14764 && GET_CODE (XEXP (disp, 0)) == PLUS
14765 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14766 symbol = XEXP (XEXP (disp, 0), 0);
14768 if (GET_CODE (symbol) == LABEL_REF
14769 || (GET_CODE (symbol) == SYMBOL_REF
14770 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14771 base = pc_rtx;
14773 if (!base && !index)
14775 /* Displacement only requires special attention. */
14777 if (CONST_INT_P (disp))
14779 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14780 fputs ("ds:", file);
14781 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14783 else if (flag_pic)
14784 output_pic_addr_const (file, disp, 0);
14785 else
14786 output_addr_const (file, disp);
14788 else
14790 /* Print SImode register names to force addr32 prefix. */
14791 if (SImode_address_operand (addr, VOIDmode))
14793 #ifdef ENABLE_CHECKING
14794 gcc_assert (TARGET_64BIT);
14795 switch (GET_CODE (addr))
14797 case SUBREG:
14798 gcc_assert (GET_MODE (addr) == SImode);
14799 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14800 break;
14801 case ZERO_EXTEND:
14802 case AND:
14803 gcc_assert (GET_MODE (addr) == DImode);
14804 break;
14805 default:
14806 gcc_unreachable ();
14808 #endif
14809 gcc_assert (!code);
14810 code = 'k';
14812 else if (code == 0
14813 && TARGET_X32
14814 && disp
14815 && CONST_INT_P (disp)
14816 && INTVAL (disp) < -16*1024*1024)
14818 /* X32 runs in 64-bit mode, where displacement, DISP, in
14819 address DISP(%r64), is encoded as 32-bit immediate sign-
14820 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14821 address is %r64 + 0xffffffffbffffd00. When %r64 <
14822 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14823 which is invalid for x32. The correct address is %r64
14824 - 0x40000300 == 0xf7ffdd64. To properly encode
14825 -0x40000300(%r64) for x32, we zero-extend negative
14826 displacement by forcing addr32 prefix which truncates
14827 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14828 zero-extend all negative displacements, including -1(%rsp).
14829 However, for small negative displacements, sign-extension
14830 won't cause overflow. We only zero-extend negative
14831 displacements if they are < -16*1024*1024, which is also used
14832 to check legitimate address displacements for PIC. */
14833 code = 'k';
14836 if (ASSEMBLER_DIALECT == ASM_ATT)
14838 if (disp)
14840 if (flag_pic)
14841 output_pic_addr_const (file, disp, 0);
14842 else if (GET_CODE (disp) == LABEL_REF)
14843 output_asm_label (disp);
14844 else
14845 output_addr_const (file, disp);
14848 putc ('(', file);
14849 if (base)
14850 print_reg (base, code, file);
14851 if (index)
14853 putc (',', file);
14854 print_reg (index, vsib ? 0 : code, file);
14855 if (scale != 1 || vsib)
14856 fprintf (file, ",%d", scale);
14858 putc (')', file);
14860 else
14862 rtx offset = NULL_RTX;
14864 if (disp)
14866 /* Pull out the offset of a symbol; print any symbol itself. */
14867 if (GET_CODE (disp) == CONST
14868 && GET_CODE (XEXP (disp, 0)) == PLUS
14869 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14871 offset = XEXP (XEXP (disp, 0), 1);
14872 disp = gen_rtx_CONST (VOIDmode,
14873 XEXP (XEXP (disp, 0), 0));
14876 if (flag_pic)
14877 output_pic_addr_const (file, disp, 0);
14878 else if (GET_CODE (disp) == LABEL_REF)
14879 output_asm_label (disp);
14880 else if (CONST_INT_P (disp))
14881 offset = disp;
14882 else
14883 output_addr_const (file, disp);
14886 putc ('[', file);
14887 if (base)
14889 print_reg (base, code, file);
14890 if (offset)
14892 if (INTVAL (offset) >= 0)
14893 putc ('+', file);
14894 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14897 else if (offset)
14898 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14899 else
14900 putc ('0', file);
14902 if (index)
14904 putc ('+', file);
14905 print_reg (index, vsib ? 0 : code, file);
14906 if (scale != 1 || vsib)
14907 fprintf (file, "*%d", scale);
14909 putc (']', file);
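/* Editor's note: a minimal standalone sketch (not part of GCC; illustrative
   only) of the x32 addr32 arithmetic described in the comment above, using
   the example values from that comment.  Guarded by #if 0 so it does not
   affect the build.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
x32_disp_example (void)
{
  uint64_t r64 = 0x37ffe064;          /* base register value from the comment */
  int32_t disp = -0x40000300;         /* 32-bit displacement */

  /* Without addr32 the displacement is sign-extended to 64 bits, so the
     effective address wraps outside the 32-bit x32 address space.  */
  uint64_t sign_extended = r64 + (int64_t) disp;
  assert (sign_extended == 0xfffffffff7ffdd64ULL);

  /* The addr32 prefix truncates the effective address to 32 bits, which
     yields the intended x32 address.  */
  uint32_t addr32 = (uint32_t) sign_extended;
  assert (addr32 == 0xf7ffdd64U);
}
#endif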
14914 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14916 static bool
14917 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14919 rtx op;
14921 if (GET_CODE (x) != UNSPEC)
14922 return false;
14924 op = XVECEXP (x, 0, 0);
14925 switch (XINT (x, 1))
14927 case UNSPEC_GOTTPOFF:
14928 output_addr_const (file, op);
14929 /* FIXME: This might be @TPOFF in Sun ld. */
14930 fputs ("@gottpoff", file);
14931 break;
14932 case UNSPEC_TPOFF:
14933 output_addr_const (file, op);
14934 fputs ("@tpoff", file);
14935 break;
14936 case UNSPEC_NTPOFF:
14937 output_addr_const (file, op);
14938 if (TARGET_64BIT)
14939 fputs ("@tpoff", file);
14940 else
14941 fputs ("@ntpoff", file);
14942 break;
14943 case UNSPEC_DTPOFF:
14944 output_addr_const (file, op);
14945 fputs ("@dtpoff", file);
14946 break;
14947 case UNSPEC_GOTNTPOFF:
14948 output_addr_const (file, op);
14949 if (TARGET_64BIT)
14950 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14951 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14952 else
14953 fputs ("@gotntpoff", file);
14954 break;
14955 case UNSPEC_INDNTPOFF:
14956 output_addr_const (file, op);
14957 fputs ("@indntpoff", file);
14958 break;
14959 #if TARGET_MACHO
14960 case UNSPEC_MACHOPIC_OFFSET:
14961 output_addr_const (file, op);
14962 putc ('-', file);
14963 machopic_output_function_base_name (file);
14964 break;
14965 #endif
14967 case UNSPEC_STACK_CHECK:
14969 int offset;
14971 gcc_assert (flag_split_stack);
14973 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14974 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14975 #else
14976 gcc_unreachable ();
14977 #endif
14979 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14981 break;
14983 default:
14984 return false;
14987 return true;
14990 /* Split one or more double-mode RTL references into pairs of half-mode
14991 references. The RTL can be REG, offsettable MEM, integer constant, or
14992 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14993 split and "num" is its length. lo_half and hi_half are output arrays
14994 that parallel "operands". */
14996 void
14997 split_double_mode (enum machine_mode mode, rtx operands[],
14998 int num, rtx lo_half[], rtx hi_half[])
15000 enum machine_mode half_mode;
15001 unsigned int byte;
15003 switch (mode)
15005 case TImode:
15006 half_mode = DImode;
15007 break;
15008 case DImode:
15009 half_mode = SImode;
15010 break;
15011 default:
15012 gcc_unreachable ();
15015 byte = GET_MODE_SIZE (half_mode);
15017 while (num--)
15019 rtx op = operands[num];
15021 /* simplify_subreg refuses to split volatile memory addresses,
15022 but we still have to handle them. */
15023 if (MEM_P (op))
15025 lo_half[num] = adjust_address (op, half_mode, 0);
15026 hi_half[num] = adjust_address (op, half_mode, byte);
15028 else
15030 lo_half[num] = simplify_gen_subreg (half_mode, op,
15031 GET_MODE (op) == VOIDmode
15032 ? mode : GET_MODE (op), 0);
15033 hi_half[num] = simplify_gen_subreg (half_mode, op,
15034 GET_MODE (op) == VOIDmode
15035 ? mode : GET_MODE (op), byte);
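/* Editor's note: standalone illustration (not part of GCC; assumes a
   little-endian target such as x86) of the split performed above for a
   DImode value: the low half lives at byte offset 0 and the high half at
   byte offset GET_MODE_SIZE (SImode) == 4.  Guarded by #if 0.  */
#if 0
#include <stdint.h>
#include <string.h>
#include <assert.h>

static void
split_di_example (void)
{
  uint64_t di = 0x1122334455667788ULL;
  uint32_t lo, hi;

  memcpy (&lo, (char *) &di + 0, sizeof lo);   /* lo_half */
  memcpy (&hi, (char *) &di + 4, sizeof hi);   /* hi_half */

  assert (lo == 0x55667788U);
  assert (hi == 0x11223344U);
}
#endif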
15040 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15041 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15042 is the expression of the binary operation. The output may either be
15043 emitted here, or returned to the caller, like all output_* functions.
15045 There is no guarantee that the operands are the same mode, as they
15046 might be within FLOAT or FLOAT_EXTEND expressions. */
15048 #ifndef SYSV386_COMPAT
15049 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15050 wants to fix the assemblers because that causes incompatibility
15051 with gcc. No-one wants to fix gcc because that causes
15052 incompatibility with assemblers... You can use the option of
15053 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15054 #define SYSV386_COMPAT 1
15055 #endif
15057 const char *
15058 output_387_binary_op (rtx insn, rtx *operands)
15060 static char buf[40];
15061 const char *p;
15062 const char *ssep;
15063 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15065 #ifdef ENABLE_CHECKING
15066 /* Even if we do not want to check the inputs, this documents the input
15067 constraints, which helps in understanding the following code. */
15068 if (STACK_REG_P (operands[0])
15069 && ((REG_P (operands[1])
15070 && REGNO (operands[0]) == REGNO (operands[1])
15071 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15072 || (REG_P (operands[2])
15073 && REGNO (operands[0]) == REGNO (operands[2])
15074 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15075 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15076 ; /* ok */
15077 else
15078 gcc_assert (is_sse);
15079 #endif
15081 switch (GET_CODE (operands[3]))
15083 case PLUS:
15084 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15085 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15086 p = "fiadd";
15087 else
15088 p = "fadd";
15089 ssep = "vadd";
15090 break;
15092 case MINUS:
15093 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15094 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15095 p = "fisub";
15096 else
15097 p = "fsub";
15098 ssep = "vsub";
15099 break;
15101 case MULT:
15102 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15103 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15104 p = "fimul";
15105 else
15106 p = "fmul";
15107 ssep = "vmul";
15108 break;
15110 case DIV:
15111 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15112 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15113 p = "fidiv";
15114 else
15115 p = "fdiv";
15116 ssep = "vdiv";
15117 break;
15119 default:
15120 gcc_unreachable ();
15123 if (is_sse)
15125 if (TARGET_AVX)
15127 strcpy (buf, ssep);
15128 if (GET_MODE (operands[0]) == SFmode)
15129 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15130 else
15131 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15133 else
15135 strcpy (buf, ssep + 1);
15136 if (GET_MODE (operands[0]) == SFmode)
15137 strcat (buf, "ss\t{%2, %0|%0, %2}");
15138 else
15139 strcat (buf, "sd\t{%2, %0|%0, %2}");
15141 return buf;
15143 strcpy (buf, p);
15145 switch (GET_CODE (operands[3]))
15147 case MULT:
15148 case PLUS:
15149 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15151 rtx temp = operands[2];
15152 operands[2] = operands[1];
15153 operands[1] = temp;
15156 /* We know operands[0] == operands[1]. */
15158 if (MEM_P (operands[2]))
15160 p = "%Z2\t%2";
15161 break;
15164 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15166 if (STACK_TOP_P (operands[0]))
15167 /* How is it that we are storing to a dead operand[2]?
15168 Well, presumably operands[1] is dead too. We can't
15169 store the result to st(0) as st(0) gets popped on this
15170 instruction. Instead store to operands[2] (which I
15171 think has to be st(1)). st(1) will be popped later.
15172 gcc <= 2.8.1 didn't have this check and generated
15173 assembly code that the Unixware assembler rejected. */
15174 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15175 else
15176 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15177 break;
15180 if (STACK_TOP_P (operands[0]))
15181 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15182 else
15183 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15184 break;
15186 case MINUS:
15187 case DIV:
15188 if (MEM_P (operands[1]))
15190 p = "r%Z1\t%1";
15191 break;
15194 if (MEM_P (operands[2]))
15196 p = "%Z2\t%2";
15197 break;
15200 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15202 #if SYSV386_COMPAT
15203 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15204 derived assemblers, confusingly reverse the direction of
15205 the operation for fsub{r} and fdiv{r} when the
15206 destination register is not st(0). The Intel assembler
15207 doesn't have this brain damage. Read !SYSV386_COMPAT to
15208 figure out what the hardware really does. */
15209 if (STACK_TOP_P (operands[0]))
15210 p = "{p\t%0, %2|rp\t%2, %0}";
15211 else
15212 p = "{rp\t%2, %0|p\t%0, %2}";
15213 #else
15214 if (STACK_TOP_P (operands[0]))
15215 /* As above for fmul/fadd, we can't store to st(0). */
15216 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15217 else
15218 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15219 #endif
15220 break;
15223 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15225 #if SYSV386_COMPAT
15226 if (STACK_TOP_P (operands[0]))
15227 p = "{rp\t%0, %1|p\t%1, %0}";
15228 else
15229 p = "{p\t%1, %0|rp\t%0, %1}";
15230 #else
15231 if (STACK_TOP_P (operands[0]))
15232 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15233 else
15234 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15235 #endif
15236 break;
15239 if (STACK_TOP_P (operands[0]))
15241 if (STACK_TOP_P (operands[1]))
15242 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15243 else
15244 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15245 break;
15247 else if (STACK_TOP_P (operands[1]))
15249 #if SYSV386_COMPAT
15250 p = "{\t%1, %0|r\t%0, %1}";
15251 #else
15252 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15253 #endif
15255 else
15257 #if SYSV386_COMPAT
15258 p = "{r\t%2, %0|\t%0, %2}";
15259 #else
15260 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15261 #endif
15263 break;
15265 default:
15266 gcc_unreachable ();
15269 strcat (buf, p);
15270 return buf;
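/* Editor's note: standalone sketch (not part of GCC; illustrative only) of
   the output templates built above for a PLUS in SFmode.  AVX uses the
   three-operand "vaddss" form; SSE drops the "v" prefix (ssep + 1) and uses
   the two-operand form.  The {att|intel} braces in the template select the
   operand order for the two assembler dialects.  Guarded by #if 0.  */
#if 0
#include <string.h>
#include <assert.h>

static void
sse_template_example (void)
{
  char buf[40];

  /* AVX branch: strcpy (buf, ssep); strcat (buf, "ss\t{...}").  */
  strcpy (buf, "vadd");
  strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
  assert (strcmp (buf, "vaddss\t{%2, %1, %0|%0, %1, %2}") == 0);

  /* SSE branch: strcpy (buf, ssep + 1); strcat (buf, "ss\t{...}").  */
  strcpy (buf, "vadd" + 1);
  strcat (buf, "ss\t{%2, %0|%0, %2}");
  assert (strcmp (buf, "addss\t{%2, %0|%0, %2}") == 0);
}
#endif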
15273 /* Check if a 256bit AVX register is referenced inside of EXP. */
15275 static int
15276 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15278 rtx exp = *pexp;
15280 if (GET_CODE (exp) == SUBREG)
15281 exp = SUBREG_REG (exp);
15283 if (REG_P (exp)
15284 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15285 return 1;
15287 return 0;
15290 /* Return the needed mode for the entity in the optimize_mode_switching pass. */
15292 static int
15293 ix86_avx_u128_mode_needed (rtx insn)
15295 if (CALL_P (insn))
15297 rtx link;
15299 /* Needed mode is set to AVX_U128_CLEAN if there are
15300 no 256bit modes used in function arguments. */
15301 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15302 link;
15303 link = XEXP (link, 1))
15305 if (GET_CODE (XEXP (link, 0)) == USE)
15307 rtx arg = XEXP (XEXP (link, 0), 0);
15309 if (ix86_check_avx256_register (&arg, NULL))
15310 return AVX_U128_ANY;
15314 return AVX_U128_CLEAN;
15317 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
15318 changes state only when a 256bit register is written to, but we need
15319 to prevent the compiler from moving the optimal insertion point above
15320 an eventual read from a 256bit register. */
15321 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15322 return AVX_U128_DIRTY;
15324 return AVX_U128_ANY;
15327 /* Return mode that i387 must be switched into
15328 prior to the execution of insn. */
15330 static int
15331 ix86_i387_mode_needed (int entity, rtx insn)
15333 enum attr_i387_cw mode;
15335 /* The mode UNINITIALIZED is used to store the control word after a
15336 function call or ASM pattern. The mode ANY specifies that the insn
15337 has no requirements on the control word and makes no changes in the
15338 bits we are interested in. */
15340 if (CALL_P (insn)
15341 || (NONJUMP_INSN_P (insn)
15342 && (asm_noperands (PATTERN (insn)) >= 0
15343 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15344 return I387_CW_UNINITIALIZED;
15346 if (recog_memoized (insn) < 0)
15347 return I387_CW_ANY;
15349 mode = get_attr_i387_cw (insn);
15351 switch (entity)
15353 case I387_TRUNC:
15354 if (mode == I387_CW_TRUNC)
15355 return mode;
15356 break;
15358 case I387_FLOOR:
15359 if (mode == I387_CW_FLOOR)
15360 return mode;
15361 break;
15363 case I387_CEIL:
15364 if (mode == I387_CW_CEIL)
15365 return mode;
15366 break;
15368 case I387_MASK_PM:
15369 if (mode == I387_CW_MASK_PM)
15370 return mode;
15371 break;
15373 default:
15374 gcc_unreachable ();
15377 return I387_CW_ANY;
15380 /* Return mode that entity must be switched into
15381 prior to the execution of insn. */
15384 ix86_mode_needed (int entity, rtx insn)
15386 switch (entity)
15388 case AVX_U128:
15389 return ix86_avx_u128_mode_needed (insn);
15390 case I387_TRUNC:
15391 case I387_FLOOR:
15392 case I387_CEIL:
15393 case I387_MASK_PM:
15394 return ix86_i387_mode_needed (entity, insn);
15395 default:
15396 gcc_unreachable ();
15398 return 0;
15401 /* Check if a 256bit AVX register is referenced in stores. */
15403 static void
15404 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15406 if (ix86_check_avx256_register (&dest, NULL))
15408 bool *used = (bool *) data;
15409 *used = true;
15413 /* Calculate mode of upper 128bit AVX registers after the insn. */
15415 static int
15416 ix86_avx_u128_mode_after (int mode, rtx insn)
15418 rtx pat = PATTERN (insn);
15420 if (vzeroupper_operation (pat, VOIDmode)
15421 || vzeroall_operation (pat, VOIDmode))
15422 return AVX_U128_CLEAN;
15424 /* We know that the state is clean after a CALL insn if there is no
15425 256bit mode used in the function return register. */
15426 if (CALL_P (insn))
15428 bool avx_reg256_found = false;
15429 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15430 if (!avx_reg256_found)
15431 return AVX_U128_CLEAN;
15434 /* Otherwise, return current mode. Remember that if insn
15435 references AVX 256bit registers, the mode was already changed
15436 to DIRTY from MODE_NEEDED. */
15437 return mode;
15440 /* Return the mode that an insn results in. */
15443 ix86_mode_after (int entity, int mode, rtx insn)
15445 switch (entity)
15447 case AVX_U128:
15448 return ix86_avx_u128_mode_after (mode, insn);
15449 case I387_TRUNC:
15450 case I387_FLOOR:
15451 case I387_CEIL:
15452 case I387_MASK_PM:
15453 return mode;
15454 default:
15455 gcc_unreachable ();
15459 static int
15460 ix86_avx_u128_mode_entry (void)
15462 tree arg;
15464 /* Entry mode is set to AVX_U128_DIRTY if there are
15465 256bit modes used in function arguments. */
15466 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15467 arg = TREE_CHAIN (arg))
15469 rtx incoming = DECL_INCOMING_RTL (arg);
15471 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15472 return AVX_U128_DIRTY;
15475 return AVX_U128_CLEAN;
15478 /* Return a mode that ENTITY is assumed to be
15479 switched to at function entry. */
15482 ix86_mode_entry (int entity)
15484 switch (entity)
15486 case AVX_U128:
15487 return ix86_avx_u128_mode_entry ();
15488 case I387_TRUNC:
15489 case I387_FLOOR:
15490 case I387_CEIL:
15491 case I387_MASK_PM:
15492 return I387_CW_ANY;
15493 default:
15494 gcc_unreachable ();
15498 static int
15499 ix86_avx_u128_mode_exit (void)
15501 rtx reg = crtl->return_rtx;
15503 /* Exit mode is set to AVX_U128_DIRTY if there are
15504 256bit modes used in the function return register. */
15505 if (reg && ix86_check_avx256_register (&reg, NULL))
15506 return AVX_U128_DIRTY;
15508 return AVX_U128_CLEAN;
15511 /* Return a mode that ENTITY is assumed to be
15512 switched to at function exit. */
15515 ix86_mode_exit (int entity)
15517 switch (entity)
15519 case AVX_U128:
15520 return ix86_avx_u128_mode_exit ();
15521 case I387_TRUNC:
15522 case I387_FLOOR:
15523 case I387_CEIL:
15524 case I387_MASK_PM:
15525 return I387_CW_ANY;
15526 default:
15527 gcc_unreachable ();
15531 /* Output code to initialize the control word copies used by the trunc?f?i
15532 and rounding patterns. The stored copy holds the current control word,
15533 while the new copy is set to the control word for rounding mode MODE. */
15535 static void
15536 emit_i387_cw_initialization (int mode)
15538 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15539 rtx new_mode;
15541 enum ix86_stack_slot slot;
15543 rtx reg = gen_reg_rtx (HImode);
15545 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15546 emit_move_insn (reg, copy_rtx (stored_mode));
15548 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15549 || optimize_function_for_size_p (cfun))
15551 switch (mode)
15553 case I387_CW_TRUNC:
15554 /* round toward zero (truncate) */
15555 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15556 slot = SLOT_CW_TRUNC;
15557 break;
15559 case I387_CW_FLOOR:
15560 /* round down toward -oo */
15561 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15562 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15563 slot = SLOT_CW_FLOOR;
15564 break;
15566 case I387_CW_CEIL:
15567 /* round up toward +oo */
15568 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15569 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15570 slot = SLOT_CW_CEIL;
15571 break;
15573 case I387_CW_MASK_PM:
15574 /* mask precision exception for nearbyint() */
15575 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15576 slot = SLOT_CW_MASK_PM;
15577 break;
15579 default:
15580 gcc_unreachable ();
15583 else
15585 switch (mode)
15587 case I387_CW_TRUNC:
15588 /* round toward zero (truncate) */
15589 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15590 slot = SLOT_CW_TRUNC;
15591 break;
15593 case I387_CW_FLOOR:
15594 /* round down toward -oo */
15595 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15596 slot = SLOT_CW_FLOOR;
15597 break;
15599 case I387_CW_CEIL:
15600 /* round up toward +oo */
15601 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15602 slot = SLOT_CW_CEIL;
15603 break;
15605 case I387_CW_MASK_PM:
15606 /* mask precision exception for nearbyint() */
15607 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15608 slot = SLOT_CW_MASK_PM;
15609 break;
15611 default:
15612 gcc_unreachable ();
15616 gcc_assert (slot < MAX_386_STACK_LOCALS);
15618 new_mode = assign_386_stack_local (HImode, slot);
15619 emit_move_insn (new_mode, reg);
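/* Editor's note: standalone sketch (not part of GCC; helper names are
   illustrative only) of the x87 control-word fields manipulated above.
   Bits 10-11 (mask 0x0c00) are the rounding control: 00 = nearest,
   01 = down (floor), 10 = up (ceil), 11 = toward zero (trunc); bit 5
   (0x0020) masks the precision exception.  Guarded by #if 0.  */
#if 0
#include <stdint.h>
#include <assert.h>

static uint16_t
cw_for_trunc (uint16_t cw) { return cw | 0x0c00; }              /* RC = 11 */

static uint16_t
cw_for_floor (uint16_t cw) { return (cw & ~0x0c00) | 0x0400; }  /* RC = 01 */

static uint16_t
cw_for_ceil (uint16_t cw)  { return (cw & ~0x0c00) | 0x0800; }  /* RC = 10 */

static void
cw_example (void)
{
  uint16_t cw = 0x037f;                 /* default x87 control word */
  assert (cw_for_trunc (cw) == 0x0f7f);
  assert (cw_for_floor (cw) == 0x077f);
  assert (cw_for_ceil (cw) == 0x0b7f);
  assert ((cw | 0x0020) == cw);         /* PM is already set by default */
}
#endif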
15622 /* Emit vzeroupper. */
15624 void
15625 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15627 int i;
15629 /* Cancel automatic vzeroupper insertion if there are
15630 live call-saved SSE registers at the insertion point. */
15632 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15633 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15634 return;
15636 if (TARGET_64BIT)
15637 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15638 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15639 return;
15641 emit_insn (gen_avx_vzeroupper ());
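/* Editor's note: user-level view (not part of GCC; the callee name is
   hypothetical, and compiling this requires -mavx) of what the AVX_U128 mode
   switching above achieves: clear the upper ymm halves with vzeroupper
   before a call that passes no 256-bit values, so legacy SSE code in the
   callee does not pay the AVX/SSE transition penalty.  Guarded by #if 0.  */
#if 0
#include <immintrin.h>

extern void legacy_sse_callee (void);   /* hypothetical: uses only SSE code */

static void
avx_caller (__m256 v)
{
  (void) v;                  /* ... 256-bit work dirties the upper halves ... */
  _mm256_zeroupper ();       /* the vzeroupper emitted by the pass above */
  legacy_sse_callee ();      /* call site classified as AVX_U128_CLEAN */
}
#endif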
15644 /* Generate one or more insns to set ENTITY to MODE. */
15646 void
15647 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15649 switch (entity)
15651 case AVX_U128:
15652 if (mode == AVX_U128_CLEAN)
15653 ix86_avx_emit_vzeroupper (regs_live);
15654 break;
15655 case I387_TRUNC:
15656 case I387_FLOOR:
15657 case I387_CEIL:
15658 case I387_MASK_PM:
15659 if (mode != I387_CW_ANY
15660 && mode != I387_CW_UNINITIALIZED)
15661 emit_i387_cw_initialization (mode);
15662 break;
15663 default:
15664 gcc_unreachable ();
15668 /* Output code for INSN to convert a float to a signed int. OPERANDS
15669 are the insn operands. The output may be [HSD]Imode and the input
15670 operand may be [SDX]Fmode. */
15672 const char *
15673 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15675 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15676 int dimode_p = GET_MODE (operands[0]) == DImode;
15677 int round_mode = get_attr_i387_cw (insn);
15679 /* Jump through a hoop or two for DImode, since the hardware has no
15680 non-popping instruction. We used to do this a different way, but
15681 that was somewhat fragile and broke with post-reload splitters. */
15682 if ((dimode_p || fisttp) && !stack_top_dies)
15683 output_asm_insn ("fld\t%y1", operands);
15685 gcc_assert (STACK_TOP_P (operands[1]));
15686 gcc_assert (MEM_P (operands[0]));
15687 gcc_assert (GET_MODE (operands[1]) != TFmode);
15689 if (fisttp)
15690 output_asm_insn ("fisttp%Z0\t%0", operands);
15691 else
15693 if (round_mode != I387_CW_ANY)
15694 output_asm_insn ("fldcw\t%3", operands);
15695 if (stack_top_dies || dimode_p)
15696 output_asm_insn ("fistp%Z0\t%0", operands);
15697 else
15698 output_asm_insn ("fist%Z0\t%0", operands);
15699 if (round_mode != I387_CW_ANY)
15700 output_asm_insn ("fldcw\t%2", operands);
15703 return "";
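/* Editor's note: a C-level analogue (not part of GCC; illustrative only) of
   the control-word dance above: x87 fist/fistp rounds using the current
   rounding mode, so a truncating conversion temporarily selects
   round-toward-zero, converts, and then restores the previous mode.
   Guarded by #if 0.  */
#if 0
#include <fenv.h>
#include <math.h>

static long
trunc_convert (double x)
{
  int old_mode = fegetround ();   /* analogous to the stored control word */
  fesetround (FE_TOWARDZERO);     /* analogous to "fldcw %3" (trunc mode) */
  long r = lrint (x);             /* fistp honours the current rounding mode */
  fesetround (old_mode);          /* analogous to the final "fldcw %2" */
  return r;
}
#endif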
15706 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15707 have the values zero or one, indicates the ffreep insn's operand
15708 from the OPERANDS array. */
15710 static const char *
15711 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15713 if (TARGET_USE_FFREEP)
15714 #ifdef HAVE_AS_IX86_FFREEP
15715 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15716 #else
15718 static char retval[32];
15719 int regno = REGNO (operands[opno]);
15721 gcc_assert (STACK_REGNO_P (regno));
15723 regno -= FIRST_STACK_REG;
15725 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15726 return retval;
15728 #endif
15730 return opno ? "fstp\t%y1" : "fstp\t%y0";
15734 /* Output code for INSN to compare OPERANDS. EFLAGS_P is true when fcomi
15735 should be used. UNORDERED_P is true when fucom should be used. */
15737 const char *
15738 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15740 int stack_top_dies;
15741 rtx cmp_op0, cmp_op1;
15742 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15744 if (eflags_p)
15746 cmp_op0 = operands[0];
15747 cmp_op1 = operands[1];
15749 else
15751 cmp_op0 = operands[1];
15752 cmp_op1 = operands[2];
15755 if (is_sse)
15757 if (GET_MODE (operands[0]) == SFmode)
15758 if (unordered_p)
15759 return "%vucomiss\t{%1, %0|%0, %1}";
15760 else
15761 return "%vcomiss\t{%1, %0|%0, %1}";
15762 else
15763 if (unordered_p)
15764 return "%vucomisd\t{%1, %0|%0, %1}";
15765 else
15766 return "%vcomisd\t{%1, %0|%0, %1}";
15769 gcc_assert (STACK_TOP_P (cmp_op0));
15771 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15773 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15775 if (stack_top_dies)
15777 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15778 return output_387_ffreep (operands, 1);
15780 else
15781 return "ftst\n\tfnstsw\t%0";
15784 if (STACK_REG_P (cmp_op1)
15785 && stack_top_dies
15786 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15787 && REGNO (cmp_op1) != FIRST_STACK_REG)
15789 /* If the top of the 387 stack dies, and the other operand
15790 is also a stack register that dies, then this must be a
15791 `fcompp' float compare. */
15793 if (eflags_p)
15795 /* There is no double popping fcomi variant. Fortunately,
15796 eflags is immune from the fstp's cc clobbering. */
15797 if (unordered_p)
15798 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15799 else
15800 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15801 return output_387_ffreep (operands, 0);
15803 else
15805 if (unordered_p)
15806 return "fucompp\n\tfnstsw\t%0";
15807 else
15808 return "fcompp\n\tfnstsw\t%0";
15811 else
15813 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15815 static const char * const alt[16] =
15817 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15818 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15819 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15820 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15822 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15823 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15824 NULL,
15825 NULL,
15827 "fcomi\t{%y1, %0|%0, %y1}",
15828 "fcomip\t{%y1, %0|%0, %y1}",
15829 "fucomi\t{%y1, %0|%0, %y1}",
15830 "fucomip\t{%y1, %0|%0, %y1}",
15832 NULL,
15833 NULL,
15834 NULL,
15835 NULL
15838 int mask;
15839 const char *ret;
15841 mask = eflags_p << 3;
15842 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15843 mask |= unordered_p << 1;
15844 mask |= stack_top_dies;
15846 gcc_assert (mask < 16);
15847 ret = alt[mask];
15848 gcc_assert (ret);
15850 return ret;
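/* Editor's note: standalone sketch (not part of GCC; illustrative only) of
   the 4-bit index used to select an entry from the alt[] table above:
   eflags_p << 3 | intmode << 2 | unordered_p << 1 | stack_top_dies.
   Guarded by #if 0.  */
#if 0
#include <assert.h>

static void
fp_compare_mask_example (void)
{
  /* An fcomi-style (eflags_p) unordered compare whose stack top dies:  */
  int mask = (1 << 3) | (0 << 2) | (1 << 1) | 1;
  assert (mask == 11);     /* selects "fucomip\t{%y1, %0|%0, %y1}" above */
}
#endif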
15854 void
15855 ix86_output_addr_vec_elt (FILE *file, int value)
15857 const char *directive = ASM_LONG;
15859 #ifdef ASM_QUAD
15860 if (TARGET_LP64)
15861 directive = ASM_QUAD;
15862 #else
15863 gcc_assert (!TARGET_64BIT);
15864 #endif
15866 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15869 void
15870 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15872 const char *directive = ASM_LONG;
15874 #ifdef ASM_QUAD
15875 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15876 directive = ASM_QUAD;
15877 #else
15878 gcc_assert (!TARGET_64BIT);
15879 #endif
15880 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15881 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15882 fprintf (file, "%s%s%d-%s%d\n",
15883 directive, LPREFIX, value, LPREFIX, rel);
15884 else if (HAVE_AS_GOTOFF_IN_DATA)
15885 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15886 #if TARGET_MACHO
15887 else if (TARGET_MACHO)
15889 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15890 machopic_output_function_base_name (file);
15891 putc ('\n', file);
15893 #endif
15894 else
15895 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15896 GOT_SYMBOL_NAME, LPREFIX, value);
15899 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15900 for the target. */
15902 void
15903 ix86_expand_clear (rtx dest)
15905 rtx tmp;
15907 /* We play register width games, which are only valid after reload. */
15908 gcc_assert (reload_completed);
15910 /* Avoid HImode and its attendant prefix byte. */
15911 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15912 dest = gen_rtx_REG (SImode, REGNO (dest));
15913 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15915 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15916 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15918 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15919 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15922 emit_insn (tmp);
15925 /* X is an unchanging MEM. If it is a constant pool reference, return
15926 the constant pool rtx, else NULL. */
15929 maybe_get_pool_constant (rtx x)
15931 x = ix86_delegitimize_address (XEXP (x, 0));
15933 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15934 return get_pool_constant (x);
15936 return NULL_RTX;
15939 void
15940 ix86_expand_move (enum machine_mode mode, rtx operands[])
15942 rtx op0, op1;
15943 enum tls_model model;
15945 op0 = operands[0];
15946 op1 = operands[1];
15948 if (GET_CODE (op1) == SYMBOL_REF)
15950 model = SYMBOL_REF_TLS_MODEL (op1);
15951 if (model)
15953 op1 = legitimize_tls_address (op1, model, true);
15954 op1 = force_operand (op1, op0);
15955 if (op1 == op0)
15956 return;
15957 op1 = convert_to_mode (mode, op1, 1);
15959 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15960 && SYMBOL_REF_DLLIMPORT_P (op1))
15961 op1 = legitimize_dllimport_symbol (op1, false);
15963 else if (GET_CODE (op1) == CONST
15964 && GET_CODE (XEXP (op1, 0)) == PLUS
15965 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15967 rtx addend = XEXP (XEXP (op1, 0), 1);
15968 rtx symbol = XEXP (XEXP (op1, 0), 0);
15969 rtx tmp = NULL;
15971 model = SYMBOL_REF_TLS_MODEL (symbol);
15972 if (model)
15973 tmp = legitimize_tls_address (symbol, model, true);
15974 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15975 && SYMBOL_REF_DLLIMPORT_P (symbol))
15976 tmp = legitimize_dllimport_symbol (symbol, true);
15978 if (tmp)
15980 tmp = force_operand (tmp, NULL);
15981 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15982 op0, 1, OPTAB_DIRECT);
15983 if (tmp == op0)
15984 return;
15985 op1 = convert_to_mode (mode, tmp, 1);
15989 if ((flag_pic || MACHOPIC_INDIRECT)
15990 && symbolic_operand (op1, mode))
15992 if (TARGET_MACHO && !TARGET_64BIT)
15994 #if TARGET_MACHO
15995 /* dynamic-no-pic */
15996 if (MACHOPIC_INDIRECT)
15998 rtx temp = ((reload_in_progress
15999 || ((op0 && REG_P (op0))
16000 && mode == Pmode))
16001 ? op0 : gen_reg_rtx (Pmode));
16002 op1 = machopic_indirect_data_reference (op1, temp);
16003 if (MACHOPIC_PURE)
16004 op1 = machopic_legitimize_pic_address (op1, mode,
16005 temp == op1 ? 0 : temp);
16007 if (op0 != op1 && GET_CODE (op0) != MEM)
16009 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16010 emit_insn (insn);
16011 return;
16013 if (GET_CODE (op0) == MEM)
16014 op1 = force_reg (Pmode, op1);
16015 else
16017 rtx temp = op0;
16018 if (GET_CODE (temp) != REG)
16019 temp = gen_reg_rtx (Pmode);
16020 temp = legitimize_pic_address (op1, temp);
16021 if (temp == op0)
16022 return;
16023 op1 = temp;
16025 /* dynamic-no-pic */
16026 #endif
16028 else
16030 if (MEM_P (op0))
16031 op1 = force_reg (mode, op1);
16032 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16034 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16035 op1 = legitimize_pic_address (op1, reg);
16036 if (op0 == op1)
16037 return;
16038 op1 = convert_to_mode (mode, op1, 1);
16042 else
16044 if (MEM_P (op0)
16045 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16046 || !push_operand (op0, mode))
16047 && MEM_P (op1))
16048 op1 = force_reg (mode, op1);
16050 if (push_operand (op0, mode)
16051 && ! general_no_elim_operand (op1, mode))
16052 op1 = copy_to_mode_reg (mode, op1);
16054 /* Force large constants in 64bit compilation into a register
16055 to get them CSEd. */
16056 if (can_create_pseudo_p ()
16057 && (mode == DImode) && TARGET_64BIT
16058 && immediate_operand (op1, mode)
16059 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16060 && !register_operand (op0, mode)
16061 && optimize)
16062 op1 = copy_to_mode_reg (mode, op1);
16064 if (can_create_pseudo_p ()
16065 && FLOAT_MODE_P (mode)
16066 && GET_CODE (op1) == CONST_DOUBLE)
16068 /* If we are loading a floating point constant into a register,
16069 force the value to memory now, since we'll get better code
16070 out of the back end. */
16072 op1 = validize_mem (force_const_mem (mode, op1));
16073 if (!register_operand (op0, mode))
16075 rtx temp = gen_reg_rtx (mode);
16076 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16077 emit_move_insn (op0, temp);
16078 return;
16083 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16086 void
16087 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16089 rtx op0 = operands[0], op1 = operands[1];
16090 unsigned int align = GET_MODE_ALIGNMENT (mode);
16092 /* Force constants other than zero into memory. We do not know how
16093 the instructions used to build constants modify the upper 64 bits
16094 of the register; once we have that information we may be able
16095 to handle some of them more efficiently. */
16096 if (can_create_pseudo_p ()
16097 && register_operand (op0, mode)
16098 && (CONSTANT_P (op1)
16099 || (GET_CODE (op1) == SUBREG
16100 && CONSTANT_P (SUBREG_REG (op1))))
16101 && !standard_sse_constant_p (op1))
16102 op1 = validize_mem (force_const_mem (mode, op1));
16104 /* We need to check memory alignment for SSE mode since attributes
16105 can make operands unaligned. */
16106 if (can_create_pseudo_p ()
16107 && SSE_REG_MODE_P (mode)
16108 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16109 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16111 rtx tmp[2];
16113 /* ix86_expand_vector_move_misalign() does not like constants ... */
16114 if (CONSTANT_P (op1)
16115 || (GET_CODE (op1) == SUBREG
16116 && CONSTANT_P (SUBREG_REG (op1))))
16117 op1 = validize_mem (force_const_mem (mode, op1));
16119 /* ... nor both arguments in memory. */
16120 if (!register_operand (op0, mode)
16121 && !register_operand (op1, mode))
16122 op1 = force_reg (mode, op1);
16124 tmp[0] = op0; tmp[1] = op1;
16125 ix86_expand_vector_move_misalign (mode, tmp);
16126 return;
16129 /* Make operand1 a register if it isn't already. */
16130 if (can_create_pseudo_p ()
16131 && !register_operand (op0, mode)
16132 && !register_operand (op1, mode))
16134 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16135 return;
16138 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16141 /* Split 32-byte AVX unaligned load and store if needed. */
16143 static void
16144 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16146 rtx m;
16147 rtx (*extract) (rtx, rtx, rtx);
16148 rtx (*load_unaligned) (rtx, rtx);
16149 rtx (*store_unaligned) (rtx, rtx);
16150 enum machine_mode mode;
16152 switch (GET_MODE (op0))
16154 default:
16155 gcc_unreachable ();
16156 case V32QImode:
16157 extract = gen_avx_vextractf128v32qi;
16158 load_unaligned = gen_avx_loaddqu256;
16159 store_unaligned = gen_avx_storedqu256;
16160 mode = V16QImode;
16161 break;
16162 case V8SFmode:
16163 extract = gen_avx_vextractf128v8sf;
16164 load_unaligned = gen_avx_loadups256;
16165 store_unaligned = gen_avx_storeups256;
16166 mode = V4SFmode;
16167 break;
16168 case V4DFmode:
16169 extract = gen_avx_vextractf128v4df;
16170 load_unaligned = gen_avx_loadupd256;
16171 store_unaligned = gen_avx_storeupd256;
16172 mode = V2DFmode;
16173 break;
16176 if (MEM_P (op1))
16178 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16180 rtx r = gen_reg_rtx (mode);
16181 m = adjust_address (op1, mode, 0);
16182 emit_move_insn (r, m);
16183 m = adjust_address (op1, mode, 16);
16184 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16185 emit_move_insn (op0, r);
16187 else
16188 emit_insn (load_unaligned (op0, op1));
16190 else if (MEM_P (op0))
16192 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16194 m = adjust_address (op0, mode, 0);
16195 emit_insn (extract (m, op1, const0_rtx));
16196 m = adjust_address (op0, mode, 16);
16197 emit_insn (extract (m, op1, const1_rtx));
16199 else
16200 emit_insn (store_unaligned (op0, op1));
16202 else
16203 gcc_unreachable ();
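/* Editor's note: scalar, standalone illustration (not part of GCC) of the
   split performed above: a 32-byte unaligned copy done as two independent
   16-byte halves at offsets 0 and 16, the same halves the vextractf128 /
   VEC_CONCAT sequences touch.  Guarded by #if 0.  */
#if 0
#include <string.h>

static void
copy32_split_example (void *dst, const void *src)
{
  memcpy ((char *) dst + 0, (const char *) src + 0, 16);    /* low half  */
  memcpy ((char *) dst + 16, (const char *) src + 16, 16);  /* high half */
}
#endif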
16206 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16207 straight to ix86_expand_vector_move. */
16208 /* Code generation for scalar reg-reg moves of single and double precision data:
16209 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16210 movaps reg, reg
16211 else
16212 movss reg, reg
16213 if (x86_sse_partial_reg_dependency == true)
16214 movapd reg, reg
16215 else
16216 movsd reg, reg
16218 Code generation for scalar loads of double precision data:
16219 if (x86_sse_split_regs == true)
16220 movlpd mem, reg (gas syntax)
16221 else
16222 movsd mem, reg
16224 Code generation for unaligned packed loads of single precision data
16225 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16226 if (x86_sse_unaligned_move_optimal)
16227 movups mem, reg
16229 if (x86_sse_partial_reg_dependency == true)
16231 xorps reg, reg
16232 movlps mem, reg
16233 movhps mem+8, reg
16235 else
16237 movlps mem, reg
16238 movhps mem+8, reg
16241 Code generation for unaligned packed loads of double precision data
16242 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16243 if (x86_sse_unaligned_move_optimal)
16244 movupd mem, reg
16246 if (x86_sse_split_regs == true)
16248 movlpd mem, reg
16249 movhpd mem+8, reg
16251 else
16253 movsd mem, reg
16254 movhpd mem+8, reg
16258 void
16259 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16261 rtx op0, op1, m;
16263 op0 = operands[0];
16264 op1 = operands[1];
16266 if (TARGET_AVX
16267 && GET_MODE_SIZE (mode) == 32)
16269 switch (GET_MODE_CLASS (mode))
16271 case MODE_VECTOR_INT:
16272 case MODE_INT:
16273 op0 = gen_lowpart (V32QImode, op0);
16274 op1 = gen_lowpart (V32QImode, op1);
16275 /* FALLTHRU */
16277 case MODE_VECTOR_FLOAT:
16278 ix86_avx256_split_vector_move_misalign (op0, op1);
16279 break;
16281 default:
16282 gcc_unreachable ();
16285 return;
16288 if (MEM_P (op1))
16290 /* ??? If we have typed data, then it would appear that using
16291 movdqu is the only way to get unaligned data loaded with
16292 integer type. */
16293 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16295 op0 = gen_lowpart (V16QImode, op0);
16296 op1 = gen_lowpart (V16QImode, op1);
16297 /* We will eventually emit movups based on insn attributes. */
16298 emit_insn (gen_sse2_loaddqu (op0, op1));
16300 else if (TARGET_SSE2 && mode == V2DFmode)
16302 rtx zero;
16304 if (TARGET_AVX
16305 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16306 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16307 || optimize_function_for_size_p (cfun))
16309 /* We will eventually emit movups based on insn attributes. */
16310 emit_insn (gen_sse2_loadupd (op0, op1));
16311 return;
16314 /* When SSE registers are split into halves, we can avoid
16315 writing to the top half twice. */
16316 if (TARGET_SSE_SPLIT_REGS)
16318 emit_clobber (op0);
16319 zero = op0;
16321 else
16323 /* ??? Not sure about the best option for the Intel chips.
16324 The following would seem to satisfy; the register is
16325 entirely cleared, breaking the dependency chain. We
16326 then store to the upper half, with a dependency depth
16327 of one. A rumor has it that Intel recommends two movsd
16328 followed by an unpacklpd, but this is unconfirmed. And
16329 given that the dependency depth of the unpacklpd would
16330 still be one, I'm not sure why this would be better. */
16331 zero = CONST0_RTX (V2DFmode);
16334 m = adjust_address (op1, DFmode, 0);
16335 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16336 m = adjust_address (op1, DFmode, 8);
16337 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16339 else
16341 if (TARGET_AVX
16342 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16343 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16344 || optimize_function_for_size_p (cfun))
16346 op0 = gen_lowpart (V4SFmode, op0);
16347 op1 = gen_lowpart (V4SFmode, op1);
16348 emit_insn (gen_sse_loadups (op0, op1));
16349 return;
16352 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16353 emit_move_insn (op0, CONST0_RTX (mode));
16354 else
16355 emit_clobber (op0);
16357 if (mode != V4SFmode)
16358 op0 = gen_lowpart (V4SFmode, op0);
16360 m = adjust_address (op1, V2SFmode, 0);
16361 emit_insn (gen_sse_loadlps (op0, op0, m));
16362 m = adjust_address (op1, V2SFmode, 8);
16363 emit_insn (gen_sse_loadhps (op0, op0, m));
16366 else if (MEM_P (op0))
16368 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16370 op0 = gen_lowpart (V16QImode, op0);
16371 op1 = gen_lowpart (V16QImode, op1);
16372 /* We will eventually emit movups based on insn attributes. */
16373 emit_insn (gen_sse2_storedqu (op0, op1));
16375 else if (TARGET_SSE2 && mode == V2DFmode)
16377 if (TARGET_AVX
16378 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16379 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16380 || optimize_function_for_size_p (cfun))
16381 /* We will eventually emit movups based on insn attributes. */
16382 emit_insn (gen_sse2_storeupd (op0, op1));
16383 else
16385 m = adjust_address (op0, DFmode, 0);
16386 emit_insn (gen_sse2_storelpd (m, op1));
16387 m = adjust_address (op0, DFmode, 8);
16388 emit_insn (gen_sse2_storehpd (m, op1));
16391 else
16393 if (mode != V4SFmode)
16394 op1 = gen_lowpart (V4SFmode, op1);
16396 if (TARGET_AVX
16397 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16398 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16399 || optimize_function_for_size_p (cfun))
16401 op0 = gen_lowpart (V4SFmode, op0);
16402 emit_insn (gen_sse_storeups (op0, op1));
16404 else
16406 m = adjust_address (op0, V2SFmode, 0);
16407 emit_insn (gen_sse_storelps (m, op1));
16408 m = adjust_address (op0, V2SFmode, 8);
16409 emit_insn (gen_sse_storehps (m, op1));
16413 else
16414 gcc_unreachable ();
16417 /* Expand a push in MODE. This is some mode for which we do not support
16418 proper push instructions, at least from the registers that we expect
16419 the value to live in. */
16421 void
16422 ix86_expand_push (enum machine_mode mode, rtx x)
16424 rtx tmp;
16426 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16427 GEN_INT (-GET_MODE_SIZE (mode)),
16428 stack_pointer_rtx, 1, OPTAB_DIRECT);
16429 if (tmp != stack_pointer_rtx)
16430 emit_move_insn (stack_pointer_rtx, tmp);
16432 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16434 /* When we push an operand onto the stack, it has to be aligned at least
16435 at the function argument boundary. However, since we don't have
16436 the argument type, we can't determine the actual argument
16437 boundary. */
16438 emit_move_insn (tmp, x);
16441 /* Helper function of ix86_fixup_binary_operands to canonicalize
16442 operand order. Returns true if the operands should be swapped. */
16444 static bool
16445 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16446 rtx operands[])
16448 rtx dst = operands[0];
16449 rtx src1 = operands[1];
16450 rtx src2 = operands[2];
16452 /* If the operation is not commutative, we can't do anything. */
16453 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16454 return false;
16456 /* Highest priority is that src1 should match dst. */
16457 if (rtx_equal_p (dst, src1))
16458 return false;
16459 if (rtx_equal_p (dst, src2))
16460 return true;
16462 /* Next highest priority is that immediate constants come second. */
16463 if (immediate_operand (src2, mode))
16464 return false;
16465 if (immediate_operand (src1, mode))
16466 return true;
16468 /* Lowest priority is that memory references should come second. */
16469 if (MEM_P (src2))
16470 return false;
16471 if (MEM_P (src1))
16472 return true;
16474 return false;
16478 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16479 destination to use for the operation. If different from the true
16480 destination in operands[0], a copy operation will be required. */
16483 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16484 rtx operands[])
16486 rtx dst = operands[0];
16487 rtx src1 = operands[1];
16488 rtx src2 = operands[2];
16490 /* Canonicalize operand order. */
16491 if (ix86_swap_binary_operands_p (code, mode, operands))
16493 rtx temp;
16495 /* It is invalid to swap operands of different modes. */
16496 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16498 temp = src1;
16499 src1 = src2;
16500 src2 = temp;
16503 /* Both source operands cannot be in memory. */
16504 if (MEM_P (src1) && MEM_P (src2))
16506 /* Optimization: Only read from memory once. */
16507 if (rtx_equal_p (src1, src2))
16509 src2 = force_reg (mode, src2);
16510 src1 = src2;
16512 else
16513 src2 = force_reg (mode, src2);
16516 /* If the destination is memory, and we do not have matching source
16517 operands, do things in registers. */
16518 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16519 dst = gen_reg_rtx (mode);
16521 /* Source 1 cannot be a constant. */
16522 if (CONSTANT_P (src1))
16523 src1 = force_reg (mode, src1);
16525 /* Source 1 cannot be a non-matching memory. */
16526 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16527 src1 = force_reg (mode, src1);
16529 /* Improve address combine. */
16530 if (code == PLUS
16531 && GET_MODE_CLASS (mode) == MODE_INT
16532 && MEM_P (src2))
16533 src2 = force_reg (mode, src2);
16535 operands[1] = src1;
16536 operands[2] = src2;
16537 return dst;
16540 /* Similarly, but assume that the destination has already been
16541 set up properly. */
16543 void
16544 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16545 enum machine_mode mode, rtx operands[])
16547 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16548 gcc_assert (dst == operands[0]);
16551 /* Attempt to expand a binary operator. Make the expansion closer to the
16552 actual machine, rather than just general_operand, which would allow 3 separate
16553 memory references (one output, two inputs) in a single insn. */
16555 void
16556 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16557 rtx operands[])
16559 rtx src1, src2, dst, op, clob;
16561 dst = ix86_fixup_binary_operands (code, mode, operands);
16562 src1 = operands[1];
16563 src2 = operands[2];
16565 /* Emit the instruction. */
16567 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16568 if (reload_in_progress)
16570 /* Reload doesn't know about the flags register, and doesn't know that
16571 it doesn't want to clobber it. We can only do this with PLUS. */
16572 gcc_assert (code == PLUS);
16573 emit_insn (op);
16575 else if (reload_completed
16576 && code == PLUS
16577 && !rtx_equal_p (dst, src1))
16579 /* This is going to be an LEA; avoid splitting it later. */
16580 emit_insn (op);
16582 else
16584 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16585 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16588 /* Fix up the destination if needed. */
16589 if (dst != operands[0])
16590 emit_move_insn (operands[0], dst);
16593 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16594 the given OPERANDS. */
16596 void
16597 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16598 rtx operands[])
16600 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16601 if (GET_CODE (operands[1]) == SUBREG)
16603 op1 = operands[1];
16604 op2 = operands[2];
16606 else if (GET_CODE (operands[2]) == SUBREG)
16608 op1 = operands[2];
16609 op2 = operands[1];
16611 /* Optimize (__m128i) d | (__m128i) e and similar code
16612 when d and e are float vectors into a float vector logical
16613 insn. In C/C++, without using intrinsics, there is no other way
16614 to express a vector logical operation on float vectors than
16615 to cast them temporarily to integer vectors. */
16616 if (op1
16617 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16618 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16619 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16620 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16621 && SUBREG_BYTE (op1) == 0
16622 && (GET_CODE (op2) == CONST_VECTOR
16623 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16624 && SUBREG_BYTE (op2) == 0))
16625 && can_create_pseudo_p ())
16627 rtx dst;
16628 switch (GET_MODE (SUBREG_REG (op1)))
16630 case V4SFmode:
16631 case V8SFmode:
16632 case V2DFmode:
16633 case V4DFmode:
16634 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16635 if (GET_CODE (op2) == CONST_VECTOR)
16637 op2 = gen_lowpart (GET_MODE (dst), op2);
16638 op2 = force_reg (GET_MODE (dst), op2);
16640 else
16642 op1 = operands[1];
16643 op2 = SUBREG_REG (operands[2]);
16644 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16645 op2 = force_reg (GET_MODE (dst), op2);
16647 op1 = SUBREG_REG (op1);
16648 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16649 op1 = force_reg (GET_MODE (dst), op1);
16650 emit_insn (gen_rtx_SET (VOIDmode, dst,
16651 gen_rtx_fmt_ee (code, GET_MODE (dst),
16652 op1, op2)));
16653 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16654 return;
16655 default:
16656 break;
16659 if (!nonimmediate_operand (operands[1], mode))
16660 operands[1] = force_reg (mode, operands[1]);
16661 if (!nonimmediate_operand (operands[2], mode))
16662 operands[2] = force_reg (mode, operands[2]);
16663 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16664 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16665 gen_rtx_fmt_ee (code, mode, operands[1],
16666 operands[2])));
16669 /* Return TRUE or FALSE depending on whether the binary operator meets the
16670 appropriate constraints. */
16672 bool
16673 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16674 rtx operands[3])
16676 rtx dst = operands[0];
16677 rtx src1 = operands[1];
16678 rtx src2 = operands[2];
16680 /* Both source operands cannot be in memory. */
16681 if (MEM_P (src1) && MEM_P (src2))
16682 return false;
16684 /* Canonicalize operand order for commutative operators. */
16685 if (ix86_swap_binary_operands_p (code, mode, operands))
16687 rtx temp = src1;
16688 src1 = src2;
16689 src2 = temp;
16692 /* If the destination is memory, we must have a matching source operand. */
16693 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16694 return false;
16696 /* Source 1 cannot be a constant. */
16697 if (CONSTANT_P (src1))
16698 return false;
16700 /* Source 1 cannot be a non-matching memory. */
16701 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16702 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16703 return (code == AND
16704 && (mode == HImode
16705 || mode == SImode
16706 || (TARGET_64BIT && mode == DImode))
16707 && satisfies_constraint_L (src2));
16709 return true;
16712 /* Attempt to expand a unary operator. Make the expansion closer to the
16713 actual machine, rather than just general_operand, which would allow 2 separate
16714 memory references (one output, one input) in a single insn. */
16716 void
16717 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16718 rtx operands[])
16720 int matching_memory;
16721 rtx src, dst, op, clob;
16723 dst = operands[0];
16724 src = operands[1];
16726 /* If the destination is memory, and we do not have matching source
16727 operands, do things in registers. */
16728 matching_memory = 0;
16729 if (MEM_P (dst))
16731 if (rtx_equal_p (dst, src))
16732 matching_memory = 1;
16733 else
16734 dst = gen_reg_rtx (mode);
16737 /* When the source operand is memory, the destination must match. */
16738 if (MEM_P (src) && !matching_memory)
16739 src = force_reg (mode, src);
16741 /* Emit the instruction. */
16743 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16744 if (reload_in_progress || code == NOT)
16746 /* Reload doesn't know about the flags register, and doesn't know that
16747 it doesn't want to clobber it. */
16748 gcc_assert (code == NOT);
16749 emit_insn (op);
16751 else
16753 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16754 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16757 /* Fix up the destination if needed. */
16758 if (dst != operands[0])
16759 emit_move_insn (operands[0], dst);
16762 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16763 divisor are within the range [0-255]. */
16765 void
16766 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16767 bool signed_p)
16769 rtx end_label, qimode_label;
16770 rtx insn, div, mod;
16771 rtx scratch, tmp0, tmp1, tmp2;
16772 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16773 rtx (*gen_zero_extend) (rtx, rtx);
16774 rtx (*gen_test_ccno_1) (rtx, rtx);
16776 switch (mode)
16778 case SImode:
16779 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16780 gen_test_ccno_1 = gen_testsi_ccno_1;
16781 gen_zero_extend = gen_zero_extendqisi2;
16782 break;
16783 case DImode:
16784 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16785 gen_test_ccno_1 = gen_testdi_ccno_1;
16786 gen_zero_extend = gen_zero_extendqidi2;
16787 break;
16788 default:
16789 gcc_unreachable ();
16792 end_label = gen_label_rtx ();
16793 qimode_label = gen_label_rtx ();
16795 scratch = gen_reg_rtx (mode);
16797 /* Use 8bit unsigned divmod if the dividend and divisor are within
16798 the range [0-255]. */
16799 emit_move_insn (scratch, operands[2]);
16800 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16801 scratch, 1, OPTAB_DIRECT);
16802 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16803 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16804 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16805 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16806 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16807 pc_rtx);
16808 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16809 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16810 JUMP_LABEL (insn) = qimode_label;
16812 /* Generate the original signed/unsigned divmod. */
16813 div = gen_divmod4_1 (operands[0], operands[1],
16814 operands[2], operands[3]);
16815 emit_insn (div);
16817 /* Branch to the end. */
16818 emit_jump_insn (gen_jump (end_label));
16819 emit_barrier ();
16821 /* Generate 8bit unsigned divide. */
16822 emit_label (qimode_label);
16823 /* Don't use operands[0] for the result of the 8bit divide since not all
16824 registers support QImode ZERO_EXTRACT. */
16825 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16826 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16827 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16828 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16830 if (signed_p)
16832 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16833 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16835 else
16837 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16838 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16841 /* Extract remainder from AH. */
16842 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16843 if (REG_P (operands[1]))
16844 insn = emit_move_insn (operands[1], tmp1);
16845 else
16847 /* Need a new scratch register since the old one holds the result
16848 of the 8bit divide. */
16849 scratch = gen_reg_rtx (mode);
16850 emit_move_insn (scratch, tmp1);
16851 insn = emit_move_insn (operands[1], scratch);
16853 set_unique_reg_note (insn, REG_EQUAL, mod);
16855 /* Zero extend quotient from AL. */
16856 tmp1 = gen_lowpart (QImode, tmp0);
16857 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16858 set_unique_reg_note (insn, REG_EQUAL, div);
16860 emit_label (end_label);
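/* Editor's note: standalone sketch (not part of GCC; illustrative only) of
   the fast path emitted above.  The TEST against -0x100 checks that
   (dividend | divisor) has no bits above bit 7; if so, one 16-by-8-bit
   unsigned divide leaves the quotient in AL and the remainder in AH,
   matching the full-width divmod.  Guarded by #if 0.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
idivmod_fast_path_example (uint32_t a, uint32_t b)
{
  if (b != 0 && ((a | b) & ~0xffu) == 0)   /* both operands in [0, 255] */
    {
      uint16_t dividend = (uint16_t) a;              /* lives in AX */
      uint8_t divisor = (uint8_t) b;                 /* 8-bit operand */
      uint8_t al = (uint8_t) (dividend / divisor);   /* quotient in AL */
      uint8_t ah = (uint8_t) (dividend % divisor);   /* remainder in AH */

      assert (al == a / b);
      assert (ah == a % b);
    }
}
#endif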
16863 #define LEA_MAX_STALL (3)
16864 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16866 /* Increase the given DISTANCE in half-cycles according to
16867 dependencies between the PREV and NEXT instructions.
16868 Add 1 half-cycle if there is no dependency and
16869 go to the next cycle if there is some dependency. */
16871 static unsigned int
16872 increase_distance (rtx prev, rtx next, unsigned int distance)
16874 df_ref *use_rec;
16875 df_ref *def_rec;
16877 if (!prev || !next)
16878 return distance + (distance & 1) + 2;
16880 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16881 return distance + 1;
16883 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16884 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16885 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16886 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16887 return distance + (distance & 1) + 2;
16889 return distance + 1;
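/* Editor's note: standalone sketch (not part of GCC; helper names are
   illustrative only) of the half-cycle bookkeeping above: with no
   dependency the distance advances by one half-cycle; with a dependency
   (or a missing insn) it is rounded up to the start of the next cycle,
   i.e. distance + (distance & 1) + 2.  Guarded by #if 0.  */
#if 0
#include <assert.h>

static unsigned int
advance_no_dep (unsigned int d)   { return d + 1; }

static unsigned int
advance_with_dep (unsigned int d) { return d + (d & 1) + 2; }

static void
distance_example (void)
{
  assert (advance_no_dep (3) == 4);     /* next half-cycle */
  assert (advance_with_dep (3) == 6);   /* 3 -> 4 (align), then one full cycle */
  assert (advance_with_dep (4) == 6);   /* already aligned: one full cycle */
}
#endif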
16892 /* Check whether instruction INSN defines register number
16893 REGNO1 or REGNO2. */
16895 static bool
16896 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16897 rtx insn)
16899 df_ref *def_rec;
16901 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16902 if (DF_REF_REG_DEF_P (*def_rec)
16903 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16904 && (regno1 == DF_REF_REGNO (*def_rec)
16905 || regno2 == DF_REF_REGNO (*def_rec)))
16907 return true;
16910 return false;
16913 /* Check whether instruction INSN uses register number
16914 REGNO as part of an address expression. */
16916 static bool
16917 insn_uses_reg_mem (unsigned int regno, rtx insn)
16919 df_ref *use_rec;
16921 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16922 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16923 return true;
16925 return false;
16928 /* Search backward for a non-agu definition of register number REGNO1
16929 or register number REGNO2 in the basic block, starting from instruction
16930 START up to the head of the basic block or instruction INSN.
16932 Put true into *FOUND if a definition was found
16933 and false otherwise.
16935 The distance in half-cycles between START and the found instruction or the
16936 head of the BB is added to DISTANCE and returned. */
16938 static int
16939 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16940 rtx insn, int distance,
16941 rtx start, bool *found)
16943 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16944 rtx prev = start;
16945 rtx next = NULL;
16947 *found = false;
16949 while (prev
16950 && prev != insn
16951 && distance < LEA_SEARCH_THRESHOLD)
16953 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16955 distance = increase_distance (prev, next, distance);
16956 if (insn_defines_reg (regno1, regno2, prev))
16958 if (recog_memoized (prev) < 0
16959 || get_attr_type (prev) != TYPE_LEA)
16961 *found = true;
16962 return distance;
16966 next = prev;
16968 if (prev == BB_HEAD (bb))
16969 break;
16971 prev = PREV_INSN (prev);
16974 return distance;
16977 /* Search backward for a non-agu definition of register number REGNO1
16978 or register number REGNO2 in INSN's basic block until we
16979 1. pass LEA_SEARCH_THRESHOLD instructions, or
16980 2. reach a neighbouring BB's boundary, or
16981 3. reach an agu definition.
16982 Return the distance between the non-agu definition point and INSN.
16983 If there is no definition point, return -1. */
16985 static int
16986 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16987 rtx insn)
16989 basic_block bb = BLOCK_FOR_INSN (insn);
16990 int distance = 0;
16991 bool found = false;
16993 if (insn != BB_HEAD (bb))
16994 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16995 distance, PREV_INSN (insn),
16996 &found);
16998 if (!found && distance < LEA_SEARCH_THRESHOLD)
17000 edge e;
17001 edge_iterator ei;
17002 bool simple_loop = false;
17004 FOR_EACH_EDGE (e, ei, bb->preds)
17005 if (e->src == bb)
17007 simple_loop = true;
17008 break;
17011 if (simple_loop)
17012 distance = distance_non_agu_define_in_bb (regno1, regno2,
17013 insn, distance,
17014 BB_END (bb), &found);
17015 else
17017 int shortest_dist = -1;
17018 bool found_in_bb = false;
17020 FOR_EACH_EDGE (e, ei, bb->preds)
17022 int bb_dist
17023 = distance_non_agu_define_in_bb (regno1, regno2,
17024 insn, distance,
17025 BB_END (e->src),
17026 &found_in_bb);
17027 if (found_in_bb)
17029 if (shortest_dist < 0)
17030 shortest_dist = bb_dist;
17031 else if (bb_dist > 0)
17032 shortest_dist = MIN (bb_dist, shortest_dist);
17034 found = true;
17038 distance = shortest_dist;
17042 /* get_attr_type may modify recog data. We want to make sure
17043 that recog data is valid for instruction INSN, on which
17044 distance_non_agu_define is called. INSN is unchanged here. */
17045 extract_insn_cached (insn);
17047 if (!found)
17048 return -1;
17050 return distance >> 1;
17053 /* Return the distance in half-cycles between INSN and the next
17054 insn that uses register number REGNO in a memory address, added
17055 to DISTANCE. Return -1 if REGNO is set.
17057 Put true value into *FOUND if register usage was found and
17058 false otherwise.
17059 Put true value into *REDEFINED if register redefinition was
17060 found and false otherwise. */
17062 static int
17063 distance_agu_use_in_bb (unsigned int regno,
17064 rtx insn, int distance, rtx start,
17065 bool *found, bool *redefined)
17067 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17068 rtx next = start;
17069 rtx prev = NULL;
17071 *found = false;
17072 *redefined = false;
17074 while (next
17075 && next != insn
17076 && distance < LEA_SEARCH_THRESHOLD)
17078 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17080 distance = increase_distance(prev, next, distance);
17081 if (insn_uses_reg_mem (regno, next))
17083 /* Return DISTANCE if OP0 is used in memory
17084 address in NEXT. */
17085 *found = true;
17086 return distance;
17089 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17091 /* Return -1 if OP0 is set in NEXT. */
17092 *redefined = true;
17093 return -1;
17096 prev = next;
17099 if (next == BB_END (bb))
17100 break;
17102 next = NEXT_INSN (next);
17105 return distance;
17108 /* Return the distance between INSN and the next insn that uses
17109 register number REGNO0 in a memory address. Return -1 if no such
17110 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
17112 static int
17113 distance_agu_use (unsigned int regno0, rtx insn)
17115 basic_block bb = BLOCK_FOR_INSN (insn);
17116 int distance = 0;
17117 bool found = false;
17118 bool redefined = false;
17120 if (insn != BB_END (bb))
17121 distance = distance_agu_use_in_bb (regno0, insn, distance,
17122 NEXT_INSN (insn),
17123 &found, &redefined);
17125 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17127 edge e;
17128 edge_iterator ei;
17129 bool simple_loop = false;
17131 FOR_EACH_EDGE (e, ei, bb->succs)
17132 if (e->dest == bb)
17134 simple_loop = true;
17135 break;
17138 if (simple_loop)
17139 distance = distance_agu_use_in_bb (regno0, insn,
17140 distance, BB_HEAD (bb),
17141 &found, &redefined);
17142 else
17144 int shortest_dist = -1;
17145 bool found_in_bb = false;
17146 bool redefined_in_bb = false;
17148 FOR_EACH_EDGE (e, ei, bb->succs)
17150 int bb_dist
17151 = distance_agu_use_in_bb (regno0, insn,
17152 distance, BB_HEAD (e->dest),
17153 &found_in_bb, &redefined_in_bb);
17154 if (found_in_bb)
17156 if (shortest_dist < 0)
17157 shortest_dist = bb_dist;
17158 else if (bb_dist > 0)
17159 shortest_dist = MIN (bb_dist, shortest_dist);
17161 found = true;
17165 distance = shortest_dist;
17169 if (!found || redefined)
17170 return -1;
17172 return distance >> 1;
17175 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17176 there is a dilemma of choosing LEA or ADD.
17177 Negative value: ADD is preferred over LEA.
17178 Zero: Neutral.
17179 Positive value: LEA is preferred over ADD. */
17180 #define IX86_LEA_PRIORITY 0
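/* Illustrative example with made-up numbers: ix86_lea_outperforms below
   finally tests "dist_define >= dist_use" after adding SPLIT_COST and
   IX86_LEA_PRIORITY to dist_define.  With dist_define == 2, dist_use == 3
   and split_cost == 0, a priority of 0 rejects the lea (2 < 3), while a
   priority of 1 would accept it (3 >= 3).  */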
17182 /* Return true if using the lea INSN has a performance advantage
17183 over an equivalent sequence of instructions. The sequence has
17184 SPLIT_COST cycles higher latency than the lea itself. */
17186 static bool
17187 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17188 unsigned int regno2, int split_cost)
17190 int dist_define, dist_use;
17192 dist_define = distance_non_agu_define (regno1, regno2, insn);
17193 dist_use = distance_agu_use (regno0, insn);
17195 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17197 /* If there is no non-AGU operand definition, no AGU
17198 operand usage and the split cost is 0, then both the lea
17199 and non-lea variants have the same priority. Currently
17200 we prefer lea for 64-bit code and non-lea for 32-bit
17201 code. */
17202 if (dist_use < 0 && split_cost == 0)
17203 return TARGET_64BIT || IX86_LEA_PRIORITY;
17204 else
17205 return true;
17208 /* The longer the definition distance, the more preferable lea is.
17209 Adjust the distance here to take the splitting cost and
17210 lea priority into account. */
17211 dist_define += split_cost + IX86_LEA_PRIORITY;
17213 /* If there is no use in a memory address then we just check
17214 that the split cost exceeds the AGU stall. */
17215 if (dist_use < 0)
17216 return dist_define > LEA_MAX_STALL;
17218 /* If this insn has both a backward non-agu dependence and a forward
17219 agu dependence, the one with the shorter distance takes effect. */
17220 return dist_define >= dist_use;
17223 /* Return true if it is legal to clobber flags by INSN and
17224 false otherwise. */
17226 static bool
17227 ix86_ok_to_clobber_flags (rtx insn)
17229 basic_block bb = BLOCK_FOR_INSN (insn);
17230 df_ref *use;
17231 bitmap live;
17233 while (insn)
17235 if (NONDEBUG_INSN_P (insn))
17237 for (use = DF_INSN_USES (insn); *use; use++)
17238 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17239 return false;
17241 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17242 return true;
17245 if (insn == BB_END (bb))
17246 break;
17248 insn = NEXT_INSN (insn);
17251 live = df_get_live_out(bb);
17252 return !REGNO_REG_SET_P (live, FLAGS_REG);
17255 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17256 move and add to avoid AGU stalls. */
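/* Illustrative sketch (assuming SImode registers %eax, %ebx, %ecx): when
   this function returns true, the corresponding splitter turns roughly
       leal (%ebx,%ecx), %eax
   into the ALU sequence
       movl %ebx, %eax
       addl %ecx, %eax
   so the address-generation unit is not stalled on TARGET_OPT_AGU
   (Atom-class) processors.  */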
17258 bool
17259 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17261 unsigned int regno0, regno1, regno2;
17263 /* Check if we need to optimize. */
17264 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17265 return false;
17267 /* Check it is correct to split here. */
17268 if (!ix86_ok_to_clobber_flags(insn))
17269 return false;
17271 regno0 = true_regnum (operands[0]);
17272 regno1 = true_regnum (operands[1]);
17273 regno2 = true_regnum (operands[2]);
17275 /* We need to split only adds with a non-destructive
17276 destination operand. */
17277 if (regno0 == regno1 || regno0 == regno2)
17278 return false;
17279 else
17280 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17283 /* Return true if we should emit lea instruction instead of mov
17284 instruction. */
17286 bool
17287 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17289 unsigned int regno0, regno1;
17291 /* Check if we need to optimize. */
17292 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17293 return false;
17295 /* Use lea for reg to reg moves only. */
17296 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17297 return false;
17299 regno0 = true_regnum (operands[0]);
17300 regno1 = true_regnum (operands[1]);
17302 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17305 /* Return true if we need to split lea into a sequence of
17306 instructions to avoid AGU stalls. */
17308 bool
17309 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17311 unsigned int regno0, regno1, regno2;
17312 int split_cost;
17313 struct ix86_address parts;
17314 int ok;
17316 /* Check we need to optimize. */
17317 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17318 return false;
17320 /* Check it is correct to split here. */
17321 if (!ix86_ok_to_clobber_flags(insn))
17322 return false;
17324 ok = ix86_decompose_address (operands[1], &parts);
17325 gcc_assert (ok);
17327 /* There should be at least two components in the address. */
17328 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17329 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17330 return false;
17332 /* We should not split into add if a non-legitimate PIC
17333 operand is used as the displacement. */
17334 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17335 return false;
17337 regno0 = true_regnum (operands[0]);
17338 regno1 = INVALID_REGNUM;
17339 regno2 = INVALID_REGNUM;
17341 if (parts.base)
17342 regno1 = true_regnum (parts.base);
17343 if (parts.index)
17344 regno2 = true_regnum (parts.index);
17346 split_cost = 0;
17348 /* Compute how many cycles we will add to the execution time
17349 if we split the lea into a sequence of instructions. */
17350 if (parts.base || parts.index)
17352 /* Have to use a mov instruction if the non-destructive
17353 destination form is used. */
17354 if (regno1 != regno0 && regno2 != regno0)
17355 split_cost += 1;
17357 /* Have to add index to base if both exist. */
17358 if (parts.base && parts.index)
17359 split_cost += 1;
17361 /* Have to use shift and adds if scale is 2 or greater. */
17362 if (parts.scale > 1)
17364 if (regno0 != regno1)
17365 split_cost += 1;
17366 else if (regno2 == regno0)
17367 split_cost += 4;
17368 else
17369 split_cost += parts.scale;
17372 /* Have to use an add instruction with an immediate if
17373 disp is nonzero. */
17374 if (parts.disp && parts.disp != const0_rtx)
17375 split_cost += 1;
17377 /* Subtract the price of lea. */
17378 split_cost -= 1;
17381 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17384 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17385 matches destination. RTX includes clobber of FLAGS_REG. */
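/* For illustration, a call such as
     ix86_emit_binop (PLUS, SImode, target, parts.base);
   emits RTL of the shape
     (parallel [(set target (plus:SI target base))
                (clobber (reg:CC FLAGS_REG))])
   i.e. the two-operand ALU form together with its flags clobber.  */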
17387 static void
17388 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17389 rtx dst, rtx src)
17391 rtx op, clob;
17393 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17394 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17396 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17399 /* Return true if regno1 def is nearest to the insn. */
17401 static bool
17402 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17404 rtx prev = insn;
17405 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17407 if (insn == start)
17408 return false;
17409 while (prev && prev != start)
17411 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17413 prev = PREV_INSN (prev);
17414 continue;
17416 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17417 return true;
17418 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17419 return false;
17420 prev = PREV_INSN (prev);
17423 /* None of the regs is defined in the bb. */
17424 return false;
17427 /* Split lea instructions into a sequence of instructions
17428 which are executed on ALU to avoid AGU stalls.
17429 It is assumed that it is allowed to clobber flags register
17430 at lea position. */
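/* Illustrative example (assuming SImode and pairwise-distinct registers):
   splitting "leal 8(%ebx,%ecx,4), %eax" emits roughly
       movl %ecx, %eax     <- copy the index into the destination
       sall $2, %eax       <- shift by exact_log2 of the scale
       addl %ebx, %eax     <- add the base
       addl $8, %eax       <- add the displacement
   with every ALU insn produced through ix86_emit_binop.  */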
17432 void
17433 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17435 unsigned int regno0, regno1, regno2;
17436 struct ix86_address parts;
17437 rtx target, tmp;
17438 int ok, adds;
17440 ok = ix86_decompose_address (operands[1], &parts);
17441 gcc_assert (ok);
17443 target = gen_lowpart (mode, operands[0]);
17445 regno0 = true_regnum (target);
17446 regno1 = INVALID_REGNUM;
17447 regno2 = INVALID_REGNUM;
17449 if (parts.base)
17451 parts.base = gen_lowpart (mode, parts.base);
17452 regno1 = true_regnum (parts.base);
17455 if (parts.index)
17457 parts.index = gen_lowpart (mode, parts.index);
17458 regno2 = true_regnum (parts.index);
17461 if (parts.disp)
17462 parts.disp = gen_lowpart (mode, parts.disp);
17464 if (parts.scale > 1)
17466 /* Case r1 = r1 + ... */
17467 if (regno1 == regno0)
17469 /* For the case r1 = r1 + C * r1 we would have to
17470 use multiplication, which is very
17471 expensive. Assume the cost model is wrong if we
17472 reach such a case here. */
17473 gcc_assert (regno2 != regno0);
17475 for (adds = parts.scale; adds > 0; adds--)
17476 ix86_emit_binop (PLUS, mode, target, parts.index);
17478 else
17480 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17481 if (regno0 != regno2)
17482 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17484 /* Use shift for scaling. */
17485 ix86_emit_binop (ASHIFT, mode, target,
17486 GEN_INT (exact_log2 (parts.scale)));
17488 if (parts.base)
17489 ix86_emit_binop (PLUS, mode, target, parts.base);
17491 if (parts.disp && parts.disp != const0_rtx)
17492 ix86_emit_binop (PLUS, mode, target, parts.disp);
17495 else if (!parts.base && !parts.index)
17497 gcc_assert(parts.disp);
17498 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17500 else
17502 if (!parts.base)
17504 if (regno0 != regno2)
17505 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17507 else if (!parts.index)
17509 if (regno0 != regno1)
17510 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17512 else
17514 if (regno0 == regno1)
17515 tmp = parts.index;
17516 else if (regno0 == regno2)
17517 tmp = parts.base;
17518 else
17520 rtx tmp1;
17522 /* Find better operand for SET instruction, depending
17523 on which definition is farther from the insn. */
17524 if (find_nearest_reg_def (insn, regno1, regno2))
17525 tmp = parts.index, tmp1 = parts.base;
17526 else
17527 tmp = parts.base, tmp1 = parts.index;
17529 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17531 if (parts.disp && parts.disp != const0_rtx)
17532 ix86_emit_binop (PLUS, mode, target, parts.disp);
17534 ix86_emit_binop (PLUS, mode, target, tmp1);
17535 return;
17538 ix86_emit_binop (PLUS, mode, target, tmp);
17541 if (parts.disp && parts.disp != const0_rtx)
17542 ix86_emit_binop (PLUS, mode, target, parts.disp);
17546 /* Return true if it is ok to optimize an ADD operation to an LEA
17547 operation to avoid flag-register consumption. For most processors,
17548 ADD is faster than LEA. For processors like Atom, if the
17549 destination register of the LEA holds an actual address that will be
17550 used soon, LEA is better; otherwise ADD is better. */
17552 bool
17553 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17555 unsigned int regno0 = true_regnum (operands[0]);
17556 unsigned int regno1 = true_regnum (operands[1]);
17557 unsigned int regno2 = true_regnum (operands[2]);
17559 /* If a = b + c with a != b and a != c, we must use the lea form. */
17560 if (regno0 != regno1 && regno0 != regno2)
17561 return true;
17563 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17564 return false;
17566 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17569 /* Return true if destination reg of SET_BODY is shift count of
17570 USE_BODY. */
17572 static bool
17573 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17575 rtx set_dest;
17576 rtx shift_rtx;
17577 int i;
17579 /* Retrieve destination of SET_BODY. */
17580 switch (GET_CODE (set_body))
17582 case SET:
17583 set_dest = SET_DEST (set_body);
17584 if (!set_dest || !REG_P (set_dest))
17585 return false;
17586 break;
17587 case PARALLEL:
17588 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17589 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17590 use_body))
17591 return true;
17592 default:
17593 return false;
17594 break;
17597 /* Retrieve shift count of USE_BODY. */
17598 switch (GET_CODE (use_body))
17600 case SET:
17601 shift_rtx = XEXP (use_body, 1);
17602 break;
17603 case PARALLEL:
17604 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17605 if (ix86_dep_by_shift_count_body (set_body,
17606 XVECEXP (use_body, 0, i)))
17607 return true;
17608 default:
17609 return false;
17610 break;
17613 if (shift_rtx
17614 && (GET_CODE (shift_rtx) == ASHIFT
17615 || GET_CODE (shift_rtx) == LSHIFTRT
17616 || GET_CODE (shift_rtx) == ASHIFTRT
17617 || GET_CODE (shift_rtx) == ROTATE
17618 || GET_CODE (shift_rtx) == ROTATERT))
17620 rtx shift_count = XEXP (shift_rtx, 1);
17622 /* Return true if shift count is dest of SET_BODY. */
17623 if (REG_P (shift_count))
17625 /* Add this check since this can be invoked before register
17626 allocation by the pre-reload scheduler. */
17627 if (reload_completed
17628 && true_regnum (set_dest) == true_regnum (shift_count))
17629 return true;
17630 else if (REGNO(set_dest) == REGNO(shift_count))
17631 return true;
17635 return false;
17638 /* Return true if destination reg of SET_INSN is shift count of
17639 USE_INSN. */
17641 bool
17642 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17644 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17645 PATTERN (use_insn));
17648 /* Return TRUE or FALSE depending on whether the unary operator meets the
17649 appropriate constraints. */
17651 bool
17652 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17653 enum machine_mode mode ATTRIBUTE_UNUSED,
17654 rtx operands[2] ATTRIBUTE_UNUSED)
17656 /* If one of operands is memory, source and destination must match. */
17657 if ((MEM_P (operands[0])
17658 || MEM_P (operands[1]))
17659 && ! rtx_equal_p (operands[0], operands[1]))
17660 return false;
17661 return true;
17664 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17665 are ok, keeping in mind the possible movddup alternative. */
17667 bool
17668 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17670 if (MEM_P (operands[0]))
17671 return rtx_equal_p (operands[0], operands[1 + high]);
17672 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17673 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17674 return true;
17677 /* Post-reload splitter for converting an SFmode or DFmode value in an
17678 SSE register into an unsigned SImode value. */
17680 void
17681 ix86_split_convert_uns_si_sse (rtx operands[])
17683 enum machine_mode vecmode;
17684 rtx value, large, zero_or_two31, input, two31, x;
17686 large = operands[1];
17687 zero_or_two31 = operands[2];
17688 input = operands[3];
17689 two31 = operands[4];
17690 vecmode = GET_MODE (large);
17691 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17693 /* Load up the value into the low element. We must ensure that the other
17694 elements are valid floats -- zero is the easiest such value. */
17695 if (MEM_P (input))
17697 if (vecmode == V4SFmode)
17698 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17699 else
17700 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17702 else
17704 input = gen_rtx_REG (vecmode, REGNO (input));
17705 emit_move_insn (value, CONST0_RTX (vecmode));
17706 if (vecmode == V4SFmode)
17707 emit_insn (gen_sse_movss (value, value, input));
17708 else
17709 emit_insn (gen_sse2_movsd (value, value, input));
17712 emit_move_insn (large, two31);
17713 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17715 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17716 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17718 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17719 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17721 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17722 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17724 large = gen_rtx_REG (V4SImode, REGNO (large));
17725 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17727 x = gen_rtx_REG (V4SImode, REGNO (value));
17728 if (vecmode == V4SFmode)
17729 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17730 else
17731 emit_insn (gen_sse2_cvttpd2dq (x, value));
17732 value = x;
17734 emit_insn (gen_xorv4si3 (value, value, large));
17737 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17738 Expects the 64-bit DImode to be supplied in a pair of integral
17739 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17740 -mfpmath=sse, !optimize_size only. */
17742 void
17743 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17745 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17746 rtx int_xmm, fp_xmm;
17747 rtx biases, exponents;
17748 rtx x;
17750 int_xmm = gen_reg_rtx (V4SImode);
17751 if (TARGET_INTER_UNIT_MOVES)
17752 emit_insn (gen_movdi_to_sse (int_xmm, input));
17753 else if (TARGET_SSE_SPLIT_REGS)
17755 emit_clobber (int_xmm);
17756 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17758 else
17760 x = gen_reg_rtx (V2DImode);
17761 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17762 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17765 x = gen_rtx_CONST_VECTOR (V4SImode,
17766 gen_rtvec (4, GEN_INT (0x43300000UL),
17767 GEN_INT (0x45300000UL),
17768 const0_rtx, const0_rtx));
17769 exponents = validize_mem (force_const_mem (V4SImode, x));
17771 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17772 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17774 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17775 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17776 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17777 (0x1.0p84 + double(fp_value_hi_xmm)).
17778 Note these exponents differ by 32. */
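/* Worked example: for input 0x100000005 the low word 5 becomes the
   double 0x1.0p52 + 5 and the high word 1 becomes 0x1.0p84 + 0x1.0p32.
   Subtracting the biases below leaves 5.0 and 4294967296.0, whose sum
   4294967301.0 is exactly the original unsigned value.  */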
17780 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17782 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17783 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17784 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17785 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17786 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17787 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17788 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17789 biases = validize_mem (force_const_mem (V2DFmode, biases));
17790 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17792 /* Add the upper and lower DFmode values together. */
17793 if (TARGET_SSE3)
17794 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17795 else
17797 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17798 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17799 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17802 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17805 /* Not used, but eases macroization of patterns. */
17806 void
17807 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17808 rtx input ATTRIBUTE_UNUSED)
17810 gcc_unreachable ();
17813 /* Convert an unsigned SImode value into a DFmode. Only currently used
17814 for SSE, but applicable anywhere. */
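/* Worked example: for input 0xffffffff the PLUS below wraps to the
   signed value 0x7fffffff; converting that gives 2147483647.0 and
   adding back 0x1.0p31 yields 4294967295.0, the original unsigned
   value.  */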
17816 void
17817 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17819 REAL_VALUE_TYPE TWO31r;
17820 rtx x, fp;
17822 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17823 NULL, 1, OPTAB_DIRECT);
17825 fp = gen_reg_rtx (DFmode);
17826 emit_insn (gen_floatsidf2 (fp, x));
17828 real_ldexp (&TWO31r, &dconst1, 31);
17829 x = const_double_from_real_value (TWO31r, DFmode);
17831 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17832 if (x != target)
17833 emit_move_insn (target, x);
17836 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17837 32-bit mode; otherwise we have a direct convert instruction. */
17839 void
17840 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17842 REAL_VALUE_TYPE TWO32r;
17843 rtx fp_lo, fp_hi, x;
17845 fp_lo = gen_reg_rtx (DFmode);
17846 fp_hi = gen_reg_rtx (DFmode);
17848 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17850 real_ldexp (&TWO32r, &dconst1, 32);
17851 x = const_double_from_real_value (TWO32r, DFmode);
17852 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17854 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17856 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17857 0, OPTAB_DIRECT);
17858 if (x != target)
17859 emit_move_insn (target, x);
17862 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17863 For x86_32, -mfpmath=sse, !optimize_size only. */
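/* Worked example: for input 0x12345678 the code below computes
   fp_hi = 0x1234 * 0x1.0p16 = 305397760.0 and fp_lo = 0x5678 = 22136.0;
   their sum 305419896.0 equals the unsigned input exactly, which a
   single signed conversion could not guarantee for inputs with the top
   bit set.  */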
17864 void
17865 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17867 REAL_VALUE_TYPE ONE16r;
17868 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17870 real_ldexp (&ONE16r, &dconst1, 16);
17871 x = const_double_from_real_value (ONE16r, SFmode);
17872 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17873 NULL, 0, OPTAB_DIRECT);
17874 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17875 NULL, 0, OPTAB_DIRECT);
17876 fp_hi = gen_reg_rtx (SFmode);
17877 fp_lo = gen_reg_rtx (SFmode);
17878 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17879 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17880 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17881 0, OPTAB_DIRECT);
17882 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17883 0, OPTAB_DIRECT);
17884 if (!rtx_equal_p (target, fp_hi))
17885 emit_move_insn (target, fp_hi);
17888 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17889 a vector of unsigned ints VAL to vector of floats TARGET. */
17891 void
17892 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17894 rtx tmp[8];
17895 REAL_VALUE_TYPE TWO16r;
17896 enum machine_mode intmode = GET_MODE (val);
17897 enum machine_mode fltmode = GET_MODE (target);
17898 rtx (*cvt) (rtx, rtx);
17900 if (intmode == V4SImode)
17901 cvt = gen_floatv4siv4sf2;
17902 else
17903 cvt = gen_floatv8siv8sf2;
17904 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17905 tmp[0] = force_reg (intmode, tmp[0]);
17906 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17907 OPTAB_DIRECT);
17908 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17909 NULL_RTX, 1, OPTAB_DIRECT);
17910 tmp[3] = gen_reg_rtx (fltmode);
17911 emit_insn (cvt (tmp[3], tmp[1]));
17912 tmp[4] = gen_reg_rtx (fltmode);
17913 emit_insn (cvt (tmp[4], tmp[2]));
17914 real_ldexp (&TWO16r, &dconst1, 16);
17915 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17916 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17917 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17918 OPTAB_DIRECT);
17919 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17920 OPTAB_DIRECT);
17921 if (tmp[7] != target)
17922 emit_move_insn (target, tmp[7]);
17925 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17926 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17927 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17928 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
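/* Worked example for one element: for val == 3e9 the compare/and/minus
   sequence below leaves 3e9 - 0x1.0p31 == 852516352.0 to be truncated by
   the signed pattern, and records 0x80000000 in *XORP; xoring that back
   in restores the unsigned result 3000000000.  Values below 0x1.0p31
   are left untouched and get a zero xor mask.  */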
17930 rtx
17931 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17933 REAL_VALUE_TYPE TWO31r;
17934 rtx two31r, tmp[4];
17935 enum machine_mode mode = GET_MODE (val);
17936 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17937 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17938 rtx (*cmp) (rtx, rtx, rtx, rtx);
17939 int i;
17941 for (i = 0; i < 3; i++)
17942 tmp[i] = gen_reg_rtx (mode);
17943 real_ldexp (&TWO31r, &dconst1, 31);
17944 two31r = const_double_from_real_value (TWO31r, scalarmode);
17945 two31r = ix86_build_const_vector (mode, 1, two31r);
17946 two31r = force_reg (mode, two31r);
17947 switch (mode)
17949 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17950 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17951 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17952 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17953 default: gcc_unreachable ();
17955 tmp[3] = gen_rtx_LE (mode, two31r, val);
17956 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17957 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17958 0, OPTAB_DIRECT);
17959 if (intmode == V4SImode || TARGET_AVX2)
17960 *xorp = expand_simple_binop (intmode, ASHIFT,
17961 gen_lowpart (intmode, tmp[0]),
17962 GEN_INT (31), NULL_RTX, 0,
17963 OPTAB_DIRECT);
17964 else
17966 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17967 two31 = ix86_build_const_vector (intmode, 1, two31);
17968 *xorp = expand_simple_binop (intmode, AND,
17969 gen_lowpart (intmode, tmp[0]),
17970 two31, NULL_RTX, 0,
17971 OPTAB_DIRECT);
17973 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17974 0, OPTAB_DIRECT);
17977 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17978 then replicate the value for all elements of the vector
17979 register. */
17981 rtx
17982 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17984 int i, n_elt;
17985 rtvec v;
17986 enum machine_mode scalar_mode;
17988 switch (mode)
17990 case V32QImode:
17991 case V16QImode:
17992 case V16HImode:
17993 case V8HImode:
17994 case V8SImode:
17995 case V4SImode:
17996 case V4DImode:
17997 case V2DImode:
17998 gcc_assert (vect);
17999 case V8SFmode:
18000 case V4SFmode:
18001 case V4DFmode:
18002 case V2DFmode:
18003 n_elt = GET_MODE_NUNITS (mode);
18004 v = rtvec_alloc (n_elt);
18005 scalar_mode = GET_MODE_INNER (mode);
18007 RTVEC_ELT (v, 0) = value;
18009 for (i = 1; i < n_elt; ++i)
18010 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18012 return gen_rtx_CONST_VECTOR (mode, v);
18014 default:
18015 gcc_unreachable ();
18019 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18020 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18021 for an SSE register. If VECT is true, then replicate the mask for
18022 all elements of the vector register. If INVERT is true, then create
18023 a mask excluding the sign bit. */
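/* For illustration: ix86_build_signbit_mask (V4SFmode, true, false)
   builds (roughly) the V4SF constant whose four elements carry only the
   sign bit (0x80000000), while INVERT produces the complementary
   0x7fffffff mask used to clear the sign bit.  */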
18025 rtx
18026 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18028 enum machine_mode vec_mode, imode;
18029 HOST_WIDE_INT hi, lo;
18030 int shift = 63;
18031 rtx v;
18032 rtx mask;
18034 /* Find the sign bit, sign extended to 2*HWI. */
18035 switch (mode)
18037 case V8SImode:
18038 case V4SImode:
18039 case V8SFmode:
18040 case V4SFmode:
18041 vec_mode = mode;
18042 mode = GET_MODE_INNER (mode);
18043 imode = SImode;
18044 lo = 0x80000000, hi = lo < 0;
18045 break;
18047 case V4DImode:
18048 case V2DImode:
18049 case V4DFmode:
18050 case V2DFmode:
18051 vec_mode = mode;
18052 mode = GET_MODE_INNER (mode);
18053 imode = DImode;
18054 if (HOST_BITS_PER_WIDE_INT >= 64)
18055 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18056 else
18057 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18058 break;
18060 case TImode:
18061 case TFmode:
18062 vec_mode = VOIDmode;
18063 if (HOST_BITS_PER_WIDE_INT >= 64)
18065 imode = TImode;
18066 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18068 else
18070 rtvec vec;
18072 imode = DImode;
18073 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18075 if (invert)
18077 lo = ~lo, hi = ~hi;
18078 v = constm1_rtx;
18080 else
18081 v = const0_rtx;
18083 mask = immed_double_const (lo, hi, imode);
18085 vec = gen_rtvec (2, v, mask);
18086 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18087 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18089 return v;
18091 break;
18093 default:
18094 gcc_unreachable ();
18097 if (invert)
18098 lo = ~lo, hi = ~hi;
18100 /* Force this value into the low part of a fp vector constant. */
18101 mask = immed_double_const (lo, hi, imode);
18102 mask = gen_lowpart (mode, mask);
18104 if (vec_mode == VOIDmode)
18105 return force_reg (mode, mask);
18107 v = ix86_build_const_vector (vec_mode, vect, mask);
18108 return force_reg (vec_mode, v);
18111 /* Generate code for floating point ABS or NEG. */
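/* Illustrative sketch of the SSE path: with the mask built below,
   negation of a DFmode value ends up as an XOR with the sign-bit mask
   (e.g. "xorpd mask, %xmm0") and ABS as an AND with the inverted mask
   ("andpd mask, %xmm0"), since flipping or clearing the sign bit is all
   that is required.  */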
18113 void
18114 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18115 rtx operands[])
18117 rtx mask, set, dst, src;
18118 bool use_sse = false;
18119 bool vector_mode = VECTOR_MODE_P (mode);
18120 enum machine_mode vmode = mode;
18122 if (vector_mode)
18123 use_sse = true;
18124 else if (mode == TFmode)
18125 use_sse = true;
18126 else if (TARGET_SSE_MATH)
18128 use_sse = SSE_FLOAT_MODE_P (mode);
18129 if (mode == SFmode)
18130 vmode = V4SFmode;
18131 else if (mode == DFmode)
18132 vmode = V2DFmode;
18135 /* NEG and ABS performed with SSE use bitwise mask operations.
18136 Create the appropriate mask now. */
18137 if (use_sse)
18138 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18139 else
18140 mask = NULL_RTX;
18142 dst = operands[0];
18143 src = operands[1];
18145 set = gen_rtx_fmt_e (code, mode, src);
18146 set = gen_rtx_SET (VOIDmode, dst, set);
18148 if (mask)
18150 rtx use, clob;
18151 rtvec par;
18153 use = gen_rtx_USE (VOIDmode, mask);
18154 if (vector_mode)
18155 par = gen_rtvec (2, set, use);
18156 else
18158 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18159 par = gen_rtvec (3, set, use, clob);
18161 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18163 else
18164 emit_insn (set);
18167 /* Expand a copysign operation. Special case operand 0 being a constant. */
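/* For illustration: copysign computes (op0 & ~signmask) | (op1 & signmask),
   so copysign (3.0, -0.5) keeps the magnitude of 3.0 and the sign of -0.5,
   giving -3.0.  Both masks come from ix86_build_signbit_mask below.  */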
18169 void
18170 ix86_expand_copysign (rtx operands[])
18172 enum machine_mode mode, vmode;
18173 rtx dest, op0, op1, mask, nmask;
18175 dest = operands[0];
18176 op0 = operands[1];
18177 op1 = operands[2];
18179 mode = GET_MODE (dest);
18181 if (mode == SFmode)
18182 vmode = V4SFmode;
18183 else if (mode == DFmode)
18184 vmode = V2DFmode;
18185 else
18186 vmode = mode;
18188 if (GET_CODE (op0) == CONST_DOUBLE)
18190 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18192 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18193 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18195 if (mode == SFmode || mode == DFmode)
18197 if (op0 == CONST0_RTX (mode))
18198 op0 = CONST0_RTX (vmode);
18199 else
18201 rtx v = ix86_build_const_vector (vmode, false, op0);
18203 op0 = force_reg (vmode, v);
18206 else if (op0 != CONST0_RTX (mode))
18207 op0 = force_reg (mode, op0);
18209 mask = ix86_build_signbit_mask (vmode, 0, 0);
18211 if (mode == SFmode)
18212 copysign_insn = gen_copysignsf3_const;
18213 else if (mode == DFmode)
18214 copysign_insn = gen_copysigndf3_const;
18215 else
18216 copysign_insn = gen_copysigntf3_const;
18218 emit_insn (copysign_insn (dest, op0, op1, mask));
18220 else
18222 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18224 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18225 mask = ix86_build_signbit_mask (vmode, 0, 0);
18227 if (mode == SFmode)
18228 copysign_insn = gen_copysignsf3_var;
18229 else if (mode == DFmode)
18230 copysign_insn = gen_copysigndf3_var;
18231 else
18232 copysign_insn = gen_copysigntf3_var;
18234 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18238 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18239 be a constant, and so has already been expanded into a vector constant. */
18241 void
18242 ix86_split_copysign_const (rtx operands[])
18244 enum machine_mode mode, vmode;
18245 rtx dest, op0, mask, x;
18247 dest = operands[0];
18248 op0 = operands[1];
18249 mask = operands[3];
18251 mode = GET_MODE (dest);
18252 vmode = GET_MODE (mask);
18254 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18255 x = gen_rtx_AND (vmode, dest, mask);
18256 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18258 if (op0 != CONST0_RTX (vmode))
18260 x = gen_rtx_IOR (vmode, dest, op0);
18261 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18265 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18266 so we have to do two masks. */
18268 void
18269 ix86_split_copysign_var (rtx operands[])
18271 enum machine_mode mode, vmode;
18272 rtx dest, scratch, op0, op1, mask, nmask, x;
18274 dest = operands[0];
18275 scratch = operands[1];
18276 op0 = operands[2];
18277 op1 = operands[3];
18278 nmask = operands[4];
18279 mask = operands[5];
18281 mode = GET_MODE (dest);
18282 vmode = GET_MODE (mask);
18284 if (rtx_equal_p (op0, op1))
18286 /* Shouldn't happen often (it's useless, obviously), but when it does
18287 we'd generate incorrect code if we continue below. */
18288 emit_move_insn (dest, op0);
18289 return;
18292 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18294 gcc_assert (REGNO (op1) == REGNO (scratch));
18296 x = gen_rtx_AND (vmode, scratch, mask);
18297 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18299 dest = mask;
18300 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18301 x = gen_rtx_NOT (vmode, dest);
18302 x = gen_rtx_AND (vmode, x, op0);
18303 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18305 else
18307 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18309 x = gen_rtx_AND (vmode, scratch, mask);
18311 else /* alternative 2,4 */
18313 gcc_assert (REGNO (mask) == REGNO (scratch));
18314 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18315 x = gen_rtx_AND (vmode, scratch, op1);
18317 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18319 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18321 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18322 x = gen_rtx_AND (vmode, dest, nmask);
18324 else /* alternative 3,4 */
18326 gcc_assert (REGNO (nmask) == REGNO (dest));
18327 dest = nmask;
18328 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18329 x = gen_rtx_AND (vmode, dest, op0);
18331 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18334 x = gen_rtx_IOR (vmode, dest, scratch);
18335 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18338 /* Return TRUE or FALSE depending on whether the first SET in INSN
18339 has source and destination with matching CC modes, and whether the
18340 CC mode is at least as constrained as REQ_MODE. */
18342 bool
18343 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18345 rtx set;
18346 enum machine_mode set_mode;
18348 set = PATTERN (insn);
18349 if (GET_CODE (set) == PARALLEL)
18350 set = XVECEXP (set, 0, 0);
18351 gcc_assert (GET_CODE (set) == SET);
18352 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18354 set_mode = GET_MODE (SET_DEST (set));
18355 switch (set_mode)
18357 case CCNOmode:
18358 if (req_mode != CCNOmode
18359 && (req_mode != CCmode
18360 || XEXP (SET_SRC (set), 1) != const0_rtx))
18361 return false;
18362 break;
18363 case CCmode:
18364 if (req_mode == CCGCmode)
18365 return false;
18366 /* FALLTHRU */
18367 case CCGCmode:
18368 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18369 return false;
18370 /* FALLTHRU */
18371 case CCGOCmode:
18372 if (req_mode == CCZmode)
18373 return false;
18374 /* FALLTHRU */
18375 case CCZmode:
18376 break;
18378 case CCAmode:
18379 case CCCmode:
18380 case CCOmode:
18381 case CCSmode:
18382 if (set_mode != req_mode)
18383 return false;
18384 break;
18386 default:
18387 gcc_unreachable ();
18390 return GET_MODE (SET_SRC (set)) == set_mode;
18393 /* Generate insn patterns to do an integer compare of OPERANDS. */
18395 static rtx
18396 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18398 enum machine_mode cmpmode;
18399 rtx tmp, flags;
18401 cmpmode = SELECT_CC_MODE (code, op0, op1);
18402 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18404 /* This is very simple, but making the interface the same as in the
18405 FP case makes the rest of the code easier. */
18406 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18407 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18409 /* Return the test that should be put into the flags user, i.e.
18410 the bcc, scc, or cmov instruction. */
18411 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18414 /* Figure out whether to use ordered or unordered fp comparisons.
18415 Return the appropriate mode to use. */
18417 enum machine_mode
18418 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18420 /* ??? In order to make all comparisons reversible, we do all comparisons
18421 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18422 all forms of trapping and nontrapping comparisons, we can make inequality
18423 comparisons trapping again, since it results in better code when using
18424 FCOM based compares. */
18425 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18428 enum machine_mode
18429 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18431 enum machine_mode mode = GET_MODE (op0);
18433 if (SCALAR_FLOAT_MODE_P (mode))
18435 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18436 return ix86_fp_compare_mode (code);
18439 switch (code)
18441 /* Only zero flag is needed. */
18442 case EQ: /* ZF=0 */
18443 case NE: /* ZF!=0 */
18444 return CCZmode;
18445 /* Codes needing carry flag. */
18446 case GEU: /* CF=0 */
18447 case LTU: /* CF=1 */
18448 /* Detect overflow checks. They need just the carry flag. */
18449 if (GET_CODE (op0) == PLUS
18450 && rtx_equal_p (op1, XEXP (op0, 0)))
18451 return CCCmode;
18452 else
18453 return CCmode;
18454 case GTU: /* CF=0 & ZF=0 */
18455 case LEU: /* CF=1 | ZF=1 */
18456 /* Detect overflow checks. They need just the carry flag. */
18457 if (GET_CODE (op0) == MINUS
18458 && rtx_equal_p (op1, XEXP (op0, 0)))
18459 return CCCmode;
18460 else
18461 return CCmode;
18462 /* Codes possibly doable only with sign flag when
18463 comparing against zero. */
18464 case GE: /* SF=OF or SF=0 */
18465 case LT: /* SF<>OF or SF=1 */
18466 if (op1 == const0_rtx)
18467 return CCGOCmode;
18468 else
18469 /* For other cases Carry flag is not required. */
18470 return CCGCmode;
18471 /* Codes doable only with the sign flag when comparing
18472 against zero, but we miss the jump instruction for it,
18473 so we need to use relational tests against overflow,
18474 which thus needs to be zero. */
18475 case GT: /* ZF=0 & SF=OF */
18476 case LE: /* ZF=1 | SF<>OF */
18477 if (op1 == const0_rtx)
18478 return CCNOmode;
18479 else
18480 return CCGCmode;
18481 /* The strcmp pattern does (use flags) and combine may ask us for the
18482 proper mode. */
18483 case USE:
18484 return CCmode;
18485 default:
18486 gcc_unreachable ();
18490 /* Return the fixed registers used for condition codes. */
18492 static bool
18493 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18495 *p1 = FLAGS_REG;
18496 *p2 = FPSR_REG;
18497 return true;
18500 /* If two condition code modes are compatible, return a condition code
18501 mode which is compatible with both. Otherwise, return
18502 VOIDmode. */
18504 static enum machine_mode
18505 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18507 if (m1 == m2)
18508 return m1;
18510 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18511 return VOIDmode;
18513 if ((m1 == CCGCmode && m2 == CCGOCmode)
18514 || (m1 == CCGOCmode && m2 == CCGCmode))
18515 return CCGCmode;
18517 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18518 return m2;
18519 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18520 return m1;
18522 switch (m1)
18524 default:
18525 gcc_unreachable ();
18527 case CCmode:
18528 case CCGCmode:
18529 case CCGOCmode:
18530 case CCNOmode:
18531 case CCAmode:
18532 case CCCmode:
18533 case CCOmode:
18534 case CCSmode:
18535 case CCZmode:
18536 switch (m2)
18538 default:
18539 return VOIDmode;
18541 case CCmode:
18542 case CCGCmode:
18543 case CCGOCmode:
18544 case CCNOmode:
18545 case CCAmode:
18546 case CCCmode:
18547 case CCOmode:
18548 case CCSmode:
18549 case CCZmode:
18550 return CCmode;
18553 case CCFPmode:
18554 case CCFPUmode:
18555 /* These are only compatible with themselves, which we already
18556 checked above. */
18557 return VOIDmode;
18562 /* Return a comparison we can do that is equivalent to
18563 swap_condition (code), apart possibly from orderedness.
18564 But never change orderedness if TARGET_IEEE_FP, returning
18565 UNKNOWN in that case if necessary. */
18567 static enum rtx_code
18568 ix86_fp_swap_condition (enum rtx_code code)
18570 switch (code)
18572 case GT: /* GTU - CF=0 & ZF=0 */
18573 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18574 case GE: /* GEU - CF=0 */
18575 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18576 case UNLT: /* LTU - CF=1 */
18577 return TARGET_IEEE_FP ? UNKNOWN : GT;
18578 case UNLE: /* LEU - CF=1 | ZF=1 */
18579 return TARGET_IEEE_FP ? UNKNOWN : GE;
18580 default:
18581 return swap_condition (code);
18585 /* Return the cost of comparison CODE using the best strategy for performance.
18586 All the following functions use the number of instructions as a cost metric.
18587 In the future this should be tweaked to compute bytes for optimize_size and
18588 to take into account the performance of various instructions on various CPUs. */
18590 static int
18591 ix86_fp_comparison_cost (enum rtx_code code)
18593 int arith_cost;
18595 /* The cost of code using bit-twiddling on %ah. */
18596 switch (code)
18598 case UNLE:
18599 case UNLT:
18600 case LTGT:
18601 case GT:
18602 case GE:
18603 case UNORDERED:
18604 case ORDERED:
18605 case UNEQ:
18606 arith_cost = 4;
18607 break;
18608 case LT:
18609 case NE:
18610 case EQ:
18611 case UNGE:
18612 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18613 break;
18614 case LE:
18615 case UNGT:
18616 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18617 break;
18618 default:
18619 gcc_unreachable ();
18622 switch (ix86_fp_comparison_strategy (code))
18624 case IX86_FPCMP_COMI:
18625 return arith_cost > 4 ? 3 : 2;
18626 case IX86_FPCMP_SAHF:
18627 return arith_cost > 4 ? 4 : 3;
18628 default:
18629 return arith_cost;
18633 /* Return the strategy to use for floating-point comparisons. We assume
18634 fcomi is always preferable where available, since that is also true for size
18635 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18637 enum ix86_fpcmp_strategy
18638 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18640 /* Do fcomi/sahf based test when profitable. */
18642 if (TARGET_CMOVE)
18643 return IX86_FPCMP_COMI;
18645 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18646 return IX86_FPCMP_SAHF;
18648 return IX86_FPCMP_ARITH;
18651 /* Swap, force into registers, or otherwise massage the two operands
18652 to a fp comparison. The operands are updated in place; the new
18653 comparison code is returned. */
18655 static enum rtx_code
18656 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18658 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18659 rtx op0 = *pop0, op1 = *pop1;
18660 enum machine_mode op_mode = GET_MODE (op0);
18661 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18663 /* All of the unordered compare instructions only work on registers.
18664 The same is true of the fcomi compare instructions. The XFmode
18665 compare instructions require registers except when comparing
18666 against zero or when converting operand 1 from fixed point to
18667 floating point. */
18669 if (!is_sse
18670 && (fpcmp_mode == CCFPUmode
18671 || (op_mode == XFmode
18672 && ! (standard_80387_constant_p (op0) == 1
18673 || standard_80387_constant_p (op1) == 1)
18674 && GET_CODE (op1) != FLOAT)
18675 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18677 op0 = force_reg (op_mode, op0);
18678 op1 = force_reg (op_mode, op1);
18680 else
18682 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18683 things around if they appear profitable, otherwise force op0
18684 into a register. */
18686 if (standard_80387_constant_p (op0) == 0
18687 || (MEM_P (op0)
18688 && ! (standard_80387_constant_p (op1) == 0
18689 || MEM_P (op1))))
18691 enum rtx_code new_code = ix86_fp_swap_condition (code);
18692 if (new_code != UNKNOWN)
18694 rtx tmp;
18695 tmp = op0, op0 = op1, op1 = tmp;
18696 code = new_code;
18700 if (!REG_P (op0))
18701 op0 = force_reg (op_mode, op0);
18703 if (CONSTANT_P (op1))
18705 int tmp = standard_80387_constant_p (op1);
18706 if (tmp == 0)
18707 op1 = validize_mem (force_const_mem (op_mode, op1));
18708 else if (tmp == 1)
18710 if (TARGET_CMOVE)
18711 op1 = force_reg (op_mode, op1);
18713 else
18714 op1 = force_reg (op_mode, op1);
18718 /* Try to rearrange the comparison to make it cheaper. */
18719 if (ix86_fp_comparison_cost (code)
18720 > ix86_fp_comparison_cost (swap_condition (code))
18721 && (REG_P (op1) || can_create_pseudo_p ()))
18723 rtx tmp;
18724 tmp = op0, op0 = op1, op1 = tmp;
18725 code = swap_condition (code);
18726 if (!REG_P (op0))
18727 op0 = force_reg (op_mode, op0);
18730 *pop0 = op0;
18731 *pop1 = op1;
18732 return code;
18735 /* Convert the comparison codes we use to represent an FP comparison to the
18736 integer code that will result in a proper branch. Return UNKNOWN if no such
18737 code is available. */
18739 enum rtx_code
18740 ix86_fp_compare_code_to_integer (enum rtx_code code)
18742 switch (code)
18744 case GT:
18745 return GTU;
18746 case GE:
18747 return GEU;
18748 case ORDERED:
18749 case UNORDERED:
18750 return code;
18751 break;
18752 case UNEQ:
18753 return EQ;
18754 break;
18755 case UNLT:
18756 return LTU;
18757 break;
18758 case UNLE:
18759 return LEU;
18760 break;
18761 case LTGT:
18762 return NE;
18763 break;
18764 default:
18765 return UNKNOWN;
18769 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18771 static rtx
18772 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18774 enum machine_mode fpcmp_mode, intcmp_mode;
18775 rtx tmp, tmp2;
18777 fpcmp_mode = ix86_fp_compare_mode (code);
18778 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18780 /* Do fcomi/sahf based test when profitable. */
18781 switch (ix86_fp_comparison_strategy (code))
18783 case IX86_FPCMP_COMI:
18784 intcmp_mode = fpcmp_mode;
18785 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18786 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18787 tmp);
18788 emit_insn (tmp);
18789 break;
18791 case IX86_FPCMP_SAHF:
18792 intcmp_mode = fpcmp_mode;
18793 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18794 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18795 tmp);
18797 if (!scratch)
18798 scratch = gen_reg_rtx (HImode);
18799 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18800 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18801 break;
18803 case IX86_FPCMP_ARITH:
18804 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18805 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18806 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18807 if (!scratch)
18808 scratch = gen_reg_rtx (HImode);
18809 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18811 /* In the unordered case, we have to check C2 for NaN's, which
18812 doesn't happen to work out to anything nice combination-wise.
18813 So do some bit twiddling on the value we've got in AH to come
18814 up with an appropriate set of condition codes. */
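/* For reference: after fnstsw the FPU condition bits land in AH as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40, so the 0x45 masks used below test
   C0|C2|C3 at once.  E.g. for GT without IEEE fp, "test $0x45, %ah"
   followed by an EQ test accepts the branch exactly when all three bits
   are clear, i.e. when the fcom reported op0 > op1.  */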
18816 intcmp_mode = CCNOmode;
18817 switch (code)
18819 case GT:
18820 case UNGT:
18821 if (code == GT || !TARGET_IEEE_FP)
18823 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18824 code = EQ;
18826 else
18828 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18829 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18830 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18831 intcmp_mode = CCmode;
18832 code = GEU;
18834 break;
18835 case LT:
18836 case UNLT:
18837 if (code == LT && TARGET_IEEE_FP)
18839 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18840 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18841 intcmp_mode = CCmode;
18842 code = EQ;
18844 else
18846 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18847 code = NE;
18849 break;
18850 case GE:
18851 case UNGE:
18852 if (code == GE || !TARGET_IEEE_FP)
18854 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18855 code = EQ;
18857 else
18859 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18860 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18861 code = NE;
18863 break;
18864 case LE:
18865 case UNLE:
18866 if (code == LE && TARGET_IEEE_FP)
18868 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18869 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18870 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18871 intcmp_mode = CCmode;
18872 code = LTU;
18874 else
18876 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18877 code = NE;
18879 break;
18880 case EQ:
18881 case UNEQ:
18882 if (code == EQ && TARGET_IEEE_FP)
18884 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18885 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18886 intcmp_mode = CCmode;
18887 code = EQ;
18889 else
18891 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18892 code = NE;
18894 break;
18895 case NE:
18896 case LTGT:
18897 if (code == NE && TARGET_IEEE_FP)
18899 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18900 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18901 GEN_INT (0x40)));
18902 code = NE;
18904 else
18906 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18907 code = EQ;
18909 break;
18911 case UNORDERED:
18912 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18913 code = NE;
18914 break;
18915 case ORDERED:
18916 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18917 code = EQ;
18918 break;
18920 default:
18921 gcc_unreachable ();
18923 break;
18925 default:
18926 gcc_unreachable();
18929 /* Return the test that should be put into the flags user, i.e.
18930 the bcc, scc, or cmov instruction. */
18931 return gen_rtx_fmt_ee (code, VOIDmode,
18932 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18933 const0_rtx);
18936 static rtx
18937 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18939 rtx ret;
18941 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18942 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18944 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18946 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18947 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18949 else
18950 ret = ix86_expand_int_compare (code, op0, op1);
18952 return ret;
18955 void
18956 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18958 enum machine_mode mode = GET_MODE (op0);
18959 rtx tmp;
18961 switch (mode)
18963 case SFmode:
18964 case DFmode:
18965 case XFmode:
18966 case QImode:
18967 case HImode:
18968 case SImode:
18969 simple:
18970 tmp = ix86_expand_compare (code, op0, op1);
18971 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18972 gen_rtx_LABEL_REF (VOIDmode, label),
18973 pc_rtx);
18974 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18975 return;
18977 case DImode:
18978 if (TARGET_64BIT)
18979 goto simple;
18980 case TImode:
18981 /* Expand DImode branch into multiple compare+branch. */
18983 rtx lo[2], hi[2], label2;
18984 enum rtx_code code1, code2, code3;
18985 enum machine_mode submode;
18987 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18989 tmp = op0, op0 = op1, op1 = tmp;
18990 code = swap_condition (code);
18993 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18994 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18996 submode = mode == DImode ? SImode : DImode;
18998 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18999 avoid two branches. This costs one extra insn, so disable when
19000 optimizing for size. */
19002 if ((code == EQ || code == NE)
19003 && (!optimize_insn_for_size_p ()
19004 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19006 rtx xor0, xor1;
19008 xor1 = hi[0];
19009 if (hi[1] != const0_rtx)
19010 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19011 NULL_RTX, 0, OPTAB_WIDEN);
19013 xor0 = lo[0];
19014 if (lo[1] != const0_rtx)
19015 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19016 NULL_RTX, 0, OPTAB_WIDEN);
19018 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19019 NULL_RTX, 0, OPTAB_WIDEN);
19021 ix86_expand_branch (code, tmp, const0_rtx, label);
19022 return;
19025 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19026 op1 is a constant, and the low word is zero, then we can just
19027 examine the high word. Similarly for low word -1 and
19028 less-or-equal-than or greater-than. */
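/* For example (illustrative constants only), the unsigned 64-bit test
   "a < 0x500000000" on a 32-bit target has a zero low word in the
   constant, so it reduces to "hi(a) < 5"; likewise "a <= 0x4ffffffff"
   has low word -1 and reduces to "hi(a) <= 4".  */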
19030 if (CONST_INT_P (hi[1]))
19031 switch (code)
19033 case LT: case LTU: case GE: case GEU:
19034 if (lo[1] == const0_rtx)
19036 ix86_expand_branch (code, hi[0], hi[1], label);
19037 return;
19039 break;
19040 case LE: case LEU: case GT: case GTU:
19041 if (lo[1] == constm1_rtx)
19043 ix86_expand_branch (code, hi[0], hi[1], label);
19044 return;
19046 break;
19047 default:
19048 break;
19051 /* Otherwise, we need two or three jumps. */
19053 label2 = gen_label_rtx ();
19055 code1 = code;
19056 code2 = swap_condition (code);
19057 code3 = unsigned_condition (code);
19059 switch (code)
19061 case LT: case GT: case LTU: case GTU:
19062 break;
19064 case LE: code1 = LT; code2 = GT; break;
19065 case GE: code1 = GT; code2 = LT; break;
19066 case LEU: code1 = LTU; code2 = GTU; break;
19067 case GEU: code1 = GTU; code2 = LTU; break;
19069 case EQ: code1 = UNKNOWN; code2 = NE; break;
19070 case NE: code2 = UNKNOWN; break;
19072 default:
19073 gcc_unreachable ();
19077 * a < b =>
19078 * if (hi(a) < hi(b)) goto true;
19079 * if (hi(a) > hi(b)) goto false;
19080 * if (lo(a) < lo(b)) goto true;
19081 * false:
19084 if (code1 != UNKNOWN)
19085 ix86_expand_branch (code1, hi[0], hi[1], label);
19086 if (code2 != UNKNOWN)
19087 ix86_expand_branch (code2, hi[0], hi[1], label2);
19089 ix86_expand_branch (code3, lo[0], lo[1], label);
19091 if (code2 != UNKNOWN)
19092 emit_label (label2);
19093 return;
19096 default:
19097 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19098 goto simple;
19102 /* Split branch based on floating point condition. */
19103 void
19104 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19105 rtx target1, rtx target2, rtx tmp, rtx pushed)
19107 rtx condition;
19108 rtx i;
19110 if (target2 != pc_rtx)
19112 rtx tmp = target2;
19113 code = reverse_condition_maybe_unordered (code);
19114 target2 = target1;
19115 target1 = tmp;
19118 condition = ix86_expand_fp_compare (code, op1, op2,
19119 tmp);
19121 /* Remove pushed operand from stack. */
19122 if (pushed)
19123 ix86_free_from_memory (GET_MODE (pushed));
19125 i = emit_jump_insn (gen_rtx_SET
19126 (VOIDmode, pc_rtx,
19127 gen_rtx_IF_THEN_ELSE (VOIDmode,
19128 condition, target1, target2)));
19129 if (split_branch_probability >= 0)
19130 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19133 void
19134 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19136 rtx ret;
19138 gcc_assert (GET_MODE (dest) == QImode);
19140 ret = ix86_expand_compare (code, op0, op1);
19141 PUT_MODE (ret, QImode);
19142 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19145 /* Expand a comparison setting or clearing the carry flag. Return true when
19146 successful and set *POP to the comparison rtx. */
19147 static bool
19148 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19150 enum machine_mode mode =
19151 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19153 /* Do not handle double-mode compares, which go through a special path. */
19154 if (mode == (TARGET_64BIT ? TImode : DImode))
19155 return false;
19157 if (SCALAR_FLOAT_MODE_P (mode))
19159 rtx compare_op, compare_seq;
19161 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19163 /* Shortcut: the following common codes never translate
19164 into carry flag compares. */
19165 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19166 || code == ORDERED || code == UNORDERED)
19167 return false;
19169 /* These comparisons require the zero flag; swap the operands so they won't. */
19170 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19171 && !TARGET_IEEE_FP)
19173 rtx tmp = op0;
19174 op0 = op1;
19175 op1 = tmp;
19176 code = swap_condition (code);
19179 /* Try to expand the comparison and verify that we end up with
19180 carry flag based comparison. This fails to be true only when
19181 we decide to expand the comparison using arithmetic, which is not
19182 a common scenario. */
19183 start_sequence ();
19184 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19185 compare_seq = get_insns ();
19186 end_sequence ();
19188 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19189 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19190 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19191 else
19192 code = GET_CODE (compare_op);
19194 if (code != LTU && code != GEU)
19195 return false;
19197 emit_insn (compare_seq);
19198 *pop = compare_op;
19199 return true;
19202 if (!INTEGRAL_MODE_P (mode))
19203 return false;
19205 switch (code)
19207 case LTU:
19208 case GEU:
19209 break;
19211 /* Convert a==0 into (unsigned)a<1. */
19212 case EQ:
19213 case NE:
19214 if (op1 != const0_rtx)
19215 return false;
19216 op1 = const1_rtx;
19217 code = (code == EQ ? LTU : GEU);
19218 break;
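/* Illustration: "a == 0" holds exactly when "(unsigned) a < 1", and an
   unsigned less-than is just the carry produced by comparing a against
   1, which is the LTU/GEU form this function promises to return.  */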
19220 /* Convert a>b into b<a or a>=b+1. */
19221 case GTU:
19222 case LEU:
19223 if (CONST_INT_P (op1))
19225 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19226 /* Bail out on overflow. We still can swap operands but that
19227 would force loading of the constant into register. */
19228 if (op1 == const0_rtx
19229 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19230 return false;
19231 code = (code == GTU ? GEU : LTU);
19233 else
19235 rtx tmp = op1;
19236 op1 = op0;
19237 op0 = tmp;
19238 code = (code == GTU ? LTU : GEU);
19240 break;
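/* Illustration with a hypothetical constant: "(unsigned) a > 4" becomes
   "(unsigned) a >= 5", the GEU (carry clear) condition after comparing
   a with 5; with a non-constant b the operands are swapped instead, so
   "a > b" is tested as "b < a" (LTU, carry set).  */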
19242 /* Convert a>=0 into (unsigned)a<0x80000000. */
19243 case LT:
19244 case GE:
19245 if (mode == DImode || op1 != const0_rtx)
19246 return false;
19247 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19248 code = (code == LT ? GEU : LTU);
19249 break;
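/* Illustration: for 32-bit a, "a >= 0" holds exactly when
   "(unsigned) a < 0x80000000", i.e. when comparing a against the
   sign-bit mask produces a borrow, so the sign test again becomes a
   carry test.  */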
19250 case LE:
19251 case GT:
19252 if (mode == DImode || op1 != constm1_rtx)
19253 return false;
19254 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19255 code = (code == LE ? GEU : LTU);
19256 break;
19258 default:
19259 return false;
19261 /* Swapping operands may cause a constant to appear as the first operand. */
19262 if (!nonimmediate_operand (op0, VOIDmode))
19264 if (!can_create_pseudo_p ())
19265 return false;
19266 op0 = force_reg (mode, op0);
19268 *pop = ix86_expand_compare (code, op0, op1);
19269 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19270 return true;
19273 bool
19274 ix86_expand_int_movcc (rtx operands[])
19276 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19277 rtx compare_seq, compare_op;
19278 enum machine_mode mode = GET_MODE (operands[0]);
19279 bool sign_bit_compare_p = false;
19280 rtx op0 = XEXP (operands[1], 0);
19281 rtx op1 = XEXP (operands[1], 1);
19283 if (GET_MODE (op0) == TImode
19284 || (GET_MODE (op0) == DImode
19285 && !TARGET_64BIT))
19286 return false;
19288 start_sequence ();
19289 compare_op = ix86_expand_compare (code, op0, op1);
19290 compare_seq = get_insns ();
19291 end_sequence ();
19293 compare_code = GET_CODE (compare_op);
19295 if ((op1 == const0_rtx && (code == GE || code == LT))
19296 || (op1 == constm1_rtx && (code == GT || code == LE)))
19297 sign_bit_compare_p = true;
19299 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19300 HImode insns, we'd be swallowed in word prefix ops. */
19302 if ((mode != HImode || TARGET_FAST_PREFIX)
19303 && (mode != (TARGET_64BIT ? TImode : DImode))
19304 && CONST_INT_P (operands[2])
19305 && CONST_INT_P (operands[3]))
19307 rtx out = operands[0];
19308 HOST_WIDE_INT ct = INTVAL (operands[2]);
19309 HOST_WIDE_INT cf = INTVAL (operands[3]);
19310 HOST_WIDE_INT diff;
19312 diff = ct - cf;
19313 /* Sign bit compares are better done using shifts than by using
19314 sbb. */
19315 if (sign_bit_compare_p
19316 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19318 /* Detect overlap between destination and compare sources. */
19319 rtx tmp = out;
19321 if (!sign_bit_compare_p)
19323 rtx flags;
19324 bool fpcmp = false;
19326 compare_code = GET_CODE (compare_op);
19328 flags = XEXP (compare_op, 0);
19330 if (GET_MODE (flags) == CCFPmode
19331 || GET_MODE (flags) == CCFPUmode)
19333 fpcmp = true;
19334 compare_code
19335 = ix86_fp_compare_code_to_integer (compare_code);
19338 /* To simplify the rest of the code, restrict to the GEU case. */
19339 if (compare_code == LTU)
19341 HOST_WIDE_INT tmp = ct;
19342 ct = cf;
19343 cf = tmp;
19344 compare_code = reverse_condition (compare_code);
19345 code = reverse_condition (code);
19347 else
19349 if (fpcmp)
19350 PUT_CODE (compare_op,
19351 reverse_condition_maybe_unordered
19352 (GET_CODE (compare_op)));
19353 else
19354 PUT_CODE (compare_op,
19355 reverse_condition (GET_CODE (compare_op)));
19357 diff = ct - cf;
19359 if (reg_overlap_mentioned_p (out, op0)
19360 || reg_overlap_mentioned_p (out, op1))
19361 tmp = gen_reg_rtx (mode);
19363 if (mode == DImode)
19364 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19365 else
19366 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19367 flags, compare_op));
19369 else
19371 if (code == GT || code == GE)
19372 code = reverse_condition (code);
19373 else
19375 HOST_WIDE_INT tmp = ct;
19376 ct = cf;
19377 cf = tmp;
19378 diff = ct - cf;
19380 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19383 if (diff == 1)
19386 * cmpl op0,op1
19387 * sbbl dest,dest
19388 * [addl dest, ct]
19390 * Size 5 - 8.
19392 if (ct)
19393 tmp = expand_simple_binop (mode, PLUS,
19394 tmp, GEN_INT (ct),
19395 copy_rtx (tmp), 1, OPTAB_DIRECT);
19397 else if (cf == -1)
19400 * cmpl op0,op1
19401 * sbbl dest,dest
19402 * orl $ct, dest
19404 * Size 8.
19406 tmp = expand_simple_binop (mode, IOR,
19407 tmp, GEN_INT (ct),
19408 copy_rtx (tmp), 1, OPTAB_DIRECT);
19410 else if (diff == -1 && ct)
19413 * cmpl op0,op1
19414 * sbbl dest,dest
19415 * notl dest
19416 * [addl dest, cf]
19418 * Size 8 - 11.
19420 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19421 if (cf)
19422 tmp = expand_simple_binop (mode, PLUS,
19423 copy_rtx (tmp), GEN_INT (cf),
19424 copy_rtx (tmp), 1, OPTAB_DIRECT);
19426 else
19429 * cmpl op0,op1
19430 * sbbl dest,dest
19431 * [notl dest]
19432 * andl cf - ct, dest
19433 * [addl dest, ct]
19435 * Size 8 - 11.
19438 if (cf == 0)
19440 cf = ct;
19441 ct = 0;
19442 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19445 tmp = expand_simple_binop (mode, AND,
19446 copy_rtx (tmp),
19447 gen_int_mode (cf - ct, mode),
19448 copy_rtx (tmp), 1, OPTAB_DIRECT);
19449 if (ct)
19450 tmp = expand_simple_binop (mode, PLUS,
19451 copy_rtx (tmp), GEN_INT (ct),
19452 copy_rtx (tmp), 1, OPTAB_DIRECT);
19455 if (!rtx_equal_p (tmp, out))
19456 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19458 return true;
19461 if (diff < 0)
19463 enum machine_mode cmp_mode = GET_MODE (op0);
19465 HOST_WIDE_INT tmp;
19466 tmp = ct, ct = cf, cf = tmp;
19467 diff = -diff;
19469 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19471 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19473 /* We may be reversing an unordered compare to a normal compare, which
19474 is not valid in general (we may convert a non-trapping condition
19475 to a trapping one); however, on i386 we currently emit all
19476 comparisons unordered. */
19477 compare_code = reverse_condition_maybe_unordered (compare_code);
19478 code = reverse_condition_maybe_unordered (code);
19480 else
19482 compare_code = reverse_condition (compare_code);
19483 code = reverse_condition (code);
19487 compare_code = UNKNOWN;
19488 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19489 && CONST_INT_P (op1))
19491 if (op1 == const0_rtx
19492 && (code == LT || code == GE))
19493 compare_code = code;
19494 else if (op1 == constm1_rtx)
19496 if (code == LE)
19497 compare_code = LT;
19498 else if (code == GT)
19499 compare_code = GE;
19503 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19504 if (compare_code != UNKNOWN
19505 && GET_MODE (op0) == GET_MODE (out)
19506 && (cf == -1 || ct == -1))
19508 /* If lea code below could be used, only optimize
19509 if it results in a 2 insn sequence. */
19511 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19512 || diff == 3 || diff == 5 || diff == 9)
19513 || (compare_code == LT && ct == -1)
19514 || (compare_code == GE && cf == -1))
19517 * notl op1 (if necessary)
19518 * sarl $31, op1
19519 * orl cf, op1
19521 if (ct != -1)
19523 cf = ct;
19524 ct = -1;
19525 code = reverse_condition (code);
19528 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19530 out = expand_simple_binop (mode, IOR,
19531 out, GEN_INT (cf),
19532 out, 1, OPTAB_DIRECT);
19533 if (out != operands[0])
19534 emit_move_insn (operands[0], out);
19536 return true;
19541 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19542 || diff == 3 || diff == 5 || diff == 9)
19543 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19544 && (mode != DImode
19545 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19548 * xorl dest,dest
19549 * cmpl op1,op2
19550 * setcc dest
19551 * lea cf(dest*(ct-cf)),dest
19553 * Size 14.
19555 * This also catches the degenerate setcc-only case.
19558 rtx tmp;
19559 int nops;
19561 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19563 nops = 0;
19564 /* On x86_64 the lea instruction operates on Pmode, so we need
19565 to get the arithmetic done in the proper mode to match. */
19566 if (diff == 1)
19567 tmp = copy_rtx (out);
19568 else
19570 rtx out1;
19571 out1 = copy_rtx (out);
19572 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19573 nops++;
19574 if (diff & 1)
19576 tmp = gen_rtx_PLUS (mode, tmp, out1);
19577 nops++;
19580 if (cf != 0)
19582 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19583 nops++;
19585 if (!rtx_equal_p (tmp, out))
19587 if (nops == 1)
19588 out = force_operand (tmp, copy_rtx (out));
19589 else
19590 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19592 if (!rtx_equal_p (out, operands[0]))
19593 emit_move_insn (operands[0], copy_rtx (out));
19595 return true;
19599 * General case:                     Jumpful:
19600 *   xorl dest,dest                    cmpl op1, op2
19601 *   cmpl op1, op2                     movl ct, dest
19602 *   setcc dest                        jcc 1f
19603 *   decl dest                         movl cf, dest
19604 *   andl (cf-ct),dest               1:
19605 *   addl ct,dest
19607 * Size 20.                          Size 14.
19609 * This is reasonably steep, but branch mispredict costs are
19610 * high on modern cpus, so consider failing only if optimizing
19611 * for space.
19614 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19615 && BRANCH_COST (optimize_insn_for_speed_p (),
19616 false) >= 2)
19618 if (cf == 0)
19620 enum machine_mode cmp_mode = GET_MODE (op0);
19622 cf = ct;
19623 ct = 0;
19625 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19627 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19629 /* We may be reversing an unordered compare to a normal compare,
19630 which is not valid in general (we may convert a non-trapping
19631 condition to a trapping one); however, on i386 we currently
19632 emit all comparisons unordered. */
19633 code = reverse_condition_maybe_unordered (code);
19635 else
19637 code = reverse_condition (code);
19638 if (compare_code != UNKNOWN)
19639 compare_code = reverse_condition (compare_code);
19643 if (compare_code != UNKNOWN)
19645 /* notl op1 (if needed)
19646 sarl $31, op1
19647 andl (cf-ct), op1
19648 addl ct, op1
19650 For x < 0 (resp. x <= -1) there will be no notl,
19651 so if possible swap the constants to get rid of the
19652 complement.
19653 True/false will be -1/0 while code below (store flag
19654 followed by decrement) is 0/-1, so the constants need
19655 to be exchanged once more. */
19657 if (compare_code == GE || !cf)
19659 code = reverse_condition (code);
19660 compare_code = LT;
19662 else
19664 HOST_WIDE_INT tmp = cf;
19665 cf = ct;
19666 ct = tmp;
19669 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19671 else
19673 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19675 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19676 constm1_rtx,
19677 copy_rtx (out), 1, OPTAB_DIRECT);
19680 out = expand_simple_binop (mode, AND, copy_rtx (out),
19681 gen_int_mode (cf - ct, mode),
19682 copy_rtx (out), 1, OPTAB_DIRECT);
19683 if (ct)
19684 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19685 copy_rtx (out), 1, OPTAB_DIRECT);
19686 if (!rtx_equal_p (out, operands[0]))
19687 emit_move_insn (operands[0], copy_rtx (out));
19689 return true;
19693 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19695 /* Try a few things more with specific constants and a variable. */
19697 optab op;
19698 rtx var, orig_out, out, tmp;
19700 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19701 return false;
19703 /* If one of the two operands is an interesting constant (0 or -1), load
19704 it with the constant/constant code above and mask the variable in with a logical operation. */
19706 if (CONST_INT_P (operands[2]))
19708 var = operands[3];
19709 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19710 operands[3] = constm1_rtx, op = and_optab;
19711 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19712 operands[3] = const0_rtx, op = ior_optab;
19713 else
19714 return false;
19716 else if (CONST_INT_P (operands[3]))
19718 var = operands[2];
19719 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19720 operands[2] = constm1_rtx, op = and_optab;
19721 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19722 operands[2] = const0_rtx, op = ior_optab;
19723 else
19724 return false;
19726 else
19727 return false;
19729 orig_out = operands[0];
19730 tmp = gen_reg_rtx (mode);
19731 operands[0] = tmp;
19733 /* Recurse to get the constant loaded. */
19734 if (ix86_expand_int_movcc (operands) == 0)
19735 return false;
19737 /* Mask in the interesting variable. */
19738 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19739 OPTAB_WIDEN);
19740 if (!rtx_equal_p (out, orig_out))
19741 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19743 return true;
19747 * For comparison with above,
19749 * movl cf,dest
19750 * movl ct,tmp
19751 * cmpl op1,op2
19752 * cmovcc tmp,dest
19754 * Size 15.
19757 if (! nonimmediate_operand (operands[2], mode))
19758 operands[2] = force_reg (mode, operands[2]);
19759 if (! nonimmediate_operand (operands[3], mode))
19760 operands[3] = force_reg (mode, operands[3]);
19762 if (! register_operand (operands[2], VOIDmode)
19763 && (mode == QImode
19764 || ! register_operand (operands[3], VOIDmode)))
19765 operands[2] = force_reg (mode, operands[2]);
19767 if (mode == QImode
19768 && ! register_operand (operands[3], VOIDmode))
19769 operands[3] = force_reg (mode, operands[3]);
19771 emit_insn (compare_seq);
19772 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19773 gen_rtx_IF_THEN_ELSE (mode,
19774 compare_op, operands[2],
19775 operands[3])));
19776 return true;
19779 /* Swap, force into registers, or otherwise massage the two operands
19780 to an sse comparison with a mask result. Thus we differ a bit from
19781 ix86_prepare_fp_compare_args which expects to produce a flags result.
19783 The DEST operand exists to help determine whether to commute commutative
19784 operators. The POP0/POP1 operands are updated in place. The new
19785 comparison code is returned, or UNKNOWN if not implementable. */
19787 static enum rtx_code
19788 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19789 rtx *pop0, rtx *pop1)
19791 rtx tmp;
19793 switch (code)
19795 case LTGT:
19796 case UNEQ:
19797 /* AVX supports all the needed comparisons. */
19798 if (TARGET_AVX)
19799 break;
19800 /* We have no LTGT as an operator. We could implement it with
19801 NE & ORDERED, but this requires an extra temporary. It's
19802 not clear that it's worth it. */
19803 return UNKNOWN;
19805 case LT:
19806 case LE:
19807 case UNGT:
19808 case UNGE:
19809 /* These are supported directly. */
19810 break;
19812 case EQ:
19813 case NE:
19814 case UNORDERED:
19815 case ORDERED:
19816 /* AVX has 3 operand comparisons, no need to swap anything. */
19817 if (TARGET_AVX)
19818 break;
19819 /* For commutative operators, try to canonicalize the destination
19820 operand to be first in the comparison - this helps reload to
19821 avoid extra moves. */
19822 if (!dest || !rtx_equal_p (dest, *pop1))
19823 break;
19824 /* FALLTHRU */
19826 case GE:
19827 case GT:
19828 case UNLE:
19829 case UNLT:
19830 /* These are not supported directly before AVX, and furthermore
19831 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19832 comparison operands to transform into something that is
19833 supported. */
19834 tmp = *pop0;
19835 *pop0 = *pop1;
19836 *pop1 = tmp;
19837 code = swap_condition (code);
19838 break;
19840 default:
19841 gcc_unreachable ();
19844 return code;
19847 /* Detect conditional moves that exactly match min/max operational
19848 semantics. Note that this is IEEE safe, as long as we don't
19849 interchange the operands.
19851 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19852 and TRUE if the operation is successful and instructions are emitted. */
19854 static bool
19855 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19856 rtx cmp_op1, rtx if_true, rtx if_false)
19858 enum machine_mode mode;
19859 bool is_min;
19860 rtx tmp;
19862 if (code == LT)
19864 else if (code == UNGE)
19866 tmp = if_true;
19867 if_true = if_false;
19868 if_false = tmp;
19870 else
19871 return false;
19873 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19874 is_min = true;
19875 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19876 is_min = false;
19877 else
19878 return false;
19880 mode = GET_MODE (dest);
19882 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19883 but MODE may be a vector mode and thus not appropriate. */
19884 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19886 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19887 rtvec v;
19889 if_true = force_reg (mode, if_true);
19890 v = gen_rtvec (2, if_true, if_false);
19891 tmp = gen_rtx_UNSPEC (mode, v, u);
19893 else
19895 code = is_min ? SMIN : SMAX;
19896 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19899 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19900 return true;
19903 /* Expand an sse vector comparison. Return the register with the result. */
19905 static rtx
19906 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19907 rtx op_true, rtx op_false)
19909 enum machine_mode mode = GET_MODE (dest);
19910 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19911 rtx x;
19913 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19914 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19915 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19917 if (optimize
19918 || reg_overlap_mentioned_p (dest, op_true)
19919 || reg_overlap_mentioned_p (dest, op_false))
19920 dest = gen_reg_rtx (mode);
19922 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19923 if (cmp_mode != mode)
19925 x = force_reg (cmp_mode, x);
19926 convert_move (dest, x, false);
19928 else
19929 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19931 return dest;
19934 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19935 operations. This is used for both scalar and vector conditional moves. */
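/* A minimal sketch of the generic fallback used at the end of this
   function, assuming CMP is an all-ones/all-zeros mask per element:

     dest = (cmp & op_true) | (~cmp & op_false)

   The special cases below simply drop terms of this expression when
   OP_TRUE or OP_FALSE is all zeros or all ones, and SSE4.1/AVX blend
   instructions or XOP's vector conditional move replace it entirely
   when available.  */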
19937 static void
19938 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19940 enum machine_mode mode = GET_MODE (dest);
19941 rtx t2, t3, x;
19943 if (vector_all_ones_operand (op_true, mode)
19944 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19946 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19948 else if (op_false == CONST0_RTX (mode))
19950 op_true = force_reg (mode, op_true);
19951 x = gen_rtx_AND (mode, cmp, op_true);
19952 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19954 else if (op_true == CONST0_RTX (mode))
19956 op_false = force_reg (mode, op_false);
19957 x = gen_rtx_NOT (mode, cmp);
19958 x = gen_rtx_AND (mode, x, op_false);
19959 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19961 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19963 op_false = force_reg (mode, op_false);
19964 x = gen_rtx_IOR (mode, cmp, op_false);
19965 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19967 else if (TARGET_XOP)
19969 op_true = force_reg (mode, op_true);
19971 if (!nonimmediate_operand (op_false, mode))
19972 op_false = force_reg (mode, op_false);
19974 emit_insn (gen_rtx_SET (mode, dest,
19975 gen_rtx_IF_THEN_ELSE (mode, cmp,
19976 op_true,
19977 op_false)));
19979 else
19981 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19983 if (!nonimmediate_operand (op_true, mode))
19984 op_true = force_reg (mode, op_true);
19986 op_false = force_reg (mode, op_false);
19988 switch (mode)
19990 case V4SFmode:
19991 if (TARGET_SSE4_1)
19992 gen = gen_sse4_1_blendvps;
19993 break;
19994 case V2DFmode:
19995 if (TARGET_SSE4_1)
19996 gen = gen_sse4_1_blendvpd;
19997 break;
19998 case V16QImode:
19999 case V8HImode:
20000 case V4SImode:
20001 case V2DImode:
20002 if (TARGET_SSE4_1)
20004 gen = gen_sse4_1_pblendvb;
20005 dest = gen_lowpart (V16QImode, dest);
20006 op_false = gen_lowpart (V16QImode, op_false);
20007 op_true = gen_lowpart (V16QImode, op_true);
20008 cmp = gen_lowpart (V16QImode, cmp);
20010 break;
20011 case V8SFmode:
20012 if (TARGET_AVX)
20013 gen = gen_avx_blendvps256;
20014 break;
20015 case V4DFmode:
20016 if (TARGET_AVX)
20017 gen = gen_avx_blendvpd256;
20018 break;
20019 case V32QImode:
20020 case V16HImode:
20021 case V8SImode:
20022 case V4DImode:
20023 if (TARGET_AVX2)
20025 gen = gen_avx2_pblendvb;
20026 dest = gen_lowpart (V32QImode, dest);
20027 op_false = gen_lowpart (V32QImode, op_false);
20028 op_true = gen_lowpart (V32QImode, op_true);
20029 cmp = gen_lowpart (V32QImode, cmp);
20031 break;
20032 default:
20033 break;
20036 if (gen != NULL)
20037 emit_insn (gen (dest, op_false, op_true, cmp));
20038 else
20040 op_true = force_reg (mode, op_true);
20042 t2 = gen_reg_rtx (mode);
20043 if (optimize)
20044 t3 = gen_reg_rtx (mode);
20045 else
20046 t3 = dest;
20048 x = gen_rtx_AND (mode, op_true, cmp);
20049 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20051 x = gen_rtx_NOT (mode, cmp);
20052 x = gen_rtx_AND (mode, x, op_false);
20053 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20055 x = gen_rtx_IOR (mode, t3, t2);
20056 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20061 /* Expand a floating-point conditional move. Return true if successful. */
20063 bool
20064 ix86_expand_fp_movcc (rtx operands[])
20066 enum machine_mode mode = GET_MODE (operands[0]);
20067 enum rtx_code code = GET_CODE (operands[1]);
20068 rtx tmp, compare_op;
20069 rtx op0 = XEXP (operands[1], 0);
20070 rtx op1 = XEXP (operands[1], 1);
20072 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20074 enum machine_mode cmode;
20076 /* Since we've no cmove for sse registers, don't force bad register
20077 allocation just to gain access to it. Deny movcc when the
20078 comparison mode doesn't match the move mode. */
20079 cmode = GET_MODE (op0);
20080 if (cmode == VOIDmode)
20081 cmode = GET_MODE (op1);
20082 if (cmode != mode)
20083 return false;
20085 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20086 if (code == UNKNOWN)
20087 return false;
20089 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20090 operands[2], operands[3]))
20091 return true;
20093 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20094 operands[2], operands[3]);
20095 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20096 return true;
20099 if (GET_MODE (op0) == TImode
20100 || (GET_MODE (op0) == DImode
20101 && !TARGET_64BIT))
20102 return false;
20104 /* The floating point conditional move instructions don't directly
20105 support conditions resulting from a signed integer comparison. */
20107 compare_op = ix86_expand_compare (code, op0, op1);
20108 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20110 tmp = gen_reg_rtx (QImode);
20111 ix86_expand_setcc (tmp, code, op0, op1);
20113 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20116 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20117 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20118 operands[2], operands[3])));
20120 return true;
20123 /* Expand a floating-point vector conditional move; a vcond operation
20124 rather than a movcc operation. */
20126 bool
20127 ix86_expand_fp_vcond (rtx operands[])
20129 enum rtx_code code = GET_CODE (operands[3]);
20130 rtx cmp;
20132 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20133 &operands[4], &operands[5]);
20134 if (code == UNKNOWN)
20136 rtx temp;
20137 switch (GET_CODE (operands[3]))
20139 case LTGT:
20140 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20141 operands[5], operands[0], operands[0]);
20142 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20143 operands[5], operands[1], operands[2]);
20144 code = AND;
20145 break;
20146 case UNEQ:
20147 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20148 operands[5], operands[0], operands[0]);
20149 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20150 operands[5], operands[1], operands[2]);
20151 code = IOR;
20152 break;
20153 default:
20154 gcc_unreachable ();
20156 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20157 OPTAB_DIRECT);
20158 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20159 return true;
20162 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20163 operands[5], operands[1], operands[2]))
20164 return true;
20166 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20167 operands[1], operands[2]);
20168 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20169 return true;
20172 /* Expand a signed/unsigned integral vector conditional move. */
20174 bool
20175 ix86_expand_int_vcond (rtx operands[])
20177 enum machine_mode data_mode = GET_MODE (operands[0]);
20178 enum machine_mode mode = GET_MODE (operands[4]);
20179 enum rtx_code code = GET_CODE (operands[3]);
20180 bool negate = false;
20181 rtx x, cop0, cop1;
20183 cop0 = operands[4];
20184 cop1 = operands[5];
20186 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20187 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20188 if ((code == LT || code == GE)
20189 && data_mode == mode
20190 && cop1 == CONST0_RTX (mode)
20191 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20192 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20193 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20194 && (GET_MODE_SIZE (data_mode) == 16
20195 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20197 rtx negop = operands[2 - (code == LT)];
20198 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20199 if (negop == CONST1_RTX (data_mode))
20201 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20202 operands[0], 1, OPTAB_DIRECT);
20203 if (res != operands[0])
20204 emit_move_insn (operands[0], res);
20205 return true;
20207 else if (GET_MODE_INNER (data_mode) != DImode
20208 && vector_all_ones_operand (negop, data_mode))
20210 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20211 operands[0], 0, OPTAB_DIRECT);
20212 if (res != operands[0])
20213 emit_move_insn (operands[0], res);
20214 return true;
20218 if (!nonimmediate_operand (cop1, mode))
20219 cop1 = force_reg (mode, cop1);
20220 if (!general_operand (operands[1], data_mode))
20221 operands[1] = force_reg (data_mode, operands[1]);
20222 if (!general_operand (operands[2], data_mode))
20223 operands[2] = force_reg (data_mode, operands[2]);
20225 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20226 if (TARGET_XOP
20227 && (mode == V16QImode || mode == V8HImode
20228 || mode == V4SImode || mode == V2DImode))
20230 else
20232 /* Canonicalize the comparison to EQ, GT, GTU. */
20233 switch (code)
20235 case EQ:
20236 case GT:
20237 case GTU:
20238 break;
20240 case NE:
20241 case LE:
20242 case LEU:
20243 code = reverse_condition (code);
20244 negate = true;
20245 break;
20247 case GE:
20248 case GEU:
20249 code = reverse_condition (code);
20250 negate = true;
20251 /* FALLTHRU */
20253 case LT:
20254 case LTU:
20255 code = swap_condition (code);
20256 x = cop0, cop0 = cop1, cop1 = x;
20257 break;
20259 default:
20260 gcc_unreachable ();
20263 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20264 if (mode == V2DImode)
20266 switch (code)
20268 case EQ:
20269 /* SSE4.1 supports EQ. */
20270 if (!TARGET_SSE4_1)
20271 return false;
20272 break;
20274 case GT:
20275 case GTU:
20276 /* SSE4.2 supports GT/GTU. */
20277 if (!TARGET_SSE4_2)
20278 return false;
20279 break;
20281 default:
20282 gcc_unreachable ();
20286 /* Unsigned parallel compare is not supported by the hardware.
20287 Play some tricks to turn this into a signed comparison
20288 against 0. */
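/* Sketch of the first trick below: subtracting the per-element sign-bit
   mask (e.g. 0x80000000 for 32-bit lanes, i.e. INT_MIN) from both
   operands flips their sign bits, and
   (a - 0x80000000) >s (b - 0x80000000) holds exactly when a >u b, so
   the unsigned compare becomes the signed greater-than the hardware
   does provide.  The second trick, for the narrower element widths,
   uses unsigned saturating subtraction instead.  */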
20289 if (code == GTU)
20291 cop0 = force_reg (mode, cop0);
20293 switch (mode)
20295 case V8SImode:
20296 case V4DImode:
20297 case V4SImode:
20298 case V2DImode:
20300 rtx t1, t2, mask;
20301 rtx (*gen_sub3) (rtx, rtx, rtx);
20303 switch (mode)
20305 case V8SImode: gen_sub3 = gen_subv8si3; break;
20306 case V4DImode: gen_sub3 = gen_subv4di3; break;
20307 case V4SImode: gen_sub3 = gen_subv4si3; break;
20308 case V2DImode: gen_sub3 = gen_subv2di3; break;
20309 default:
20310 gcc_unreachable ();
20312 /* Subtract (-(INT MAX) - 1) from both operands to make
20313 them signed. */
20314 mask = ix86_build_signbit_mask (mode, true, false);
20315 t1 = gen_reg_rtx (mode);
20316 emit_insn (gen_sub3 (t1, cop0, mask));
20318 t2 = gen_reg_rtx (mode);
20319 emit_insn (gen_sub3 (t2, cop1, mask));
20321 cop0 = t1;
20322 cop1 = t2;
20323 code = GT;
20325 break;
20327 case V32QImode:
20328 case V16HImode:
20329 case V16QImode:
20330 case V8HImode:
20331 /* Perform a parallel unsigned saturating subtraction. */
20332 x = gen_reg_rtx (mode);
20333 emit_insn (gen_rtx_SET (VOIDmode, x,
20334 gen_rtx_US_MINUS (mode, cop0, cop1)));
20336 cop0 = x;
20337 cop1 = CONST0_RTX (mode);
20338 code = EQ;
20339 negate = !negate;
20340 break;
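/* The saturating subtraction relies on the fact that, with unsigned
   saturation, (a -us b) is zero exactly when a <= b; comparing the
   result against zero therefore yields a <= b, and toggling NEGATE
   turns that back into the a > b this branch was asked for.  */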
20342 default:
20343 gcc_unreachable ();
20348 /* Allow the comparison to be done in one mode, but the movcc to
20349 happen in another mode. */
20350 if (data_mode == mode)
20352 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20353 operands[1+negate], operands[2-negate]);
20355 else
20357 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20358 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20359 code, cop0, cop1,
20360 operands[1+negate], operands[2-negate]);
20361 x = gen_lowpart (data_mode, x);
20364 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20365 operands[2-negate]);
20366 return true;
20369 /* Expand a variable vector permutation. */
20371 void
20372 ix86_expand_vec_perm (rtx operands[])
20374 rtx target = operands[0];
20375 rtx op0 = operands[1];
20376 rtx op1 = operands[2];
20377 rtx mask = operands[3];
20378 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20379 enum machine_mode mode = GET_MODE (op0);
20380 enum machine_mode maskmode = GET_MODE (mask);
20381 int w, e, i;
20382 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20384 /* Number of elements in the vector. */
20385 w = GET_MODE_NUNITS (mode);
20386 e = GET_MODE_UNIT_SIZE (mode);
20387 gcc_assert (w <= 32);
20389 if (TARGET_AVX2)
20391 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20393 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20394 a constant shuffle operand. With a tiny bit of effort we can
20395 use VPERMD instead. A re-interpretation stall for V4DFmode is
20396 unfortunate but there's no avoiding it.
20397 Similarly, for V16HImode we don't have instructions for variable
20398 shuffling, while for V32QImode we can, after preparing suitable
20399 masks, use vpshufb; vpshufb; vpermq; vpor. */
20401 if (mode == V16HImode)
20403 maskmode = mode = V32QImode;
20404 w = 32;
20405 e = 1;
20407 else
20409 maskmode = mode = V8SImode;
20410 w = 8;
20411 e = 4;
20413 t1 = gen_reg_rtx (maskmode);
20415 /* Replicate the low bits of the V4DImode mask into V8SImode:
20416 mask = { A B C D }
20417 t1 = { A A B B C C D D }. */
20418 for (i = 0; i < w / 2; ++i)
20419 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20420 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20421 vt = force_reg (maskmode, vt);
20422 mask = gen_lowpart (maskmode, mask);
20423 if (maskmode == V8SImode)
20424 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20425 else
20426 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20428 /* Multiply the shuffle indices by two. */
20429 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20430 OPTAB_DIRECT);
20432 /* Add one to the odd shuffle indices:
20433 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20434 for (i = 0; i < w / 2; ++i)
20436 vec[i * 2] = const0_rtx;
20437 vec[i * 2 + 1] = const1_rtx;
20439 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20440 vt = force_const_mem (maskmode, vt);
20441 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20442 OPTAB_DIRECT);
20444 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20445 operands[3] = mask = t1;
20446 target = gen_lowpart (mode, target);
20447 op0 = gen_lowpart (mode, op0);
20448 op1 = gen_lowpart (mode, op1);
20451 switch (mode)
20453 case V8SImode:
20454 /* The VPERMD and VPERMPS instructions already properly ignore
20455 the high bits of the shuffle elements. No need for us to
20456 perform an AND ourselves. */
20457 if (one_operand_shuffle)
20458 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20459 else
20461 t1 = gen_reg_rtx (V8SImode);
20462 t2 = gen_reg_rtx (V8SImode);
20463 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20464 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20465 goto merge_two;
20467 return;
20469 case V8SFmode:
20470 mask = gen_lowpart (V8SFmode, mask);
20471 if (one_operand_shuffle)
20472 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20473 else
20475 t1 = gen_reg_rtx (V8SFmode);
20476 t2 = gen_reg_rtx (V8SFmode);
20477 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20478 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20479 goto merge_two;
20481 return;
20483 case V4SImode:
20484 /* By combining the two 128-bit input vectors into one 256-bit
20485 input vector, we can use VPERMD and VPERMPS for the full
20486 two-operand shuffle. */
20487 t1 = gen_reg_rtx (V8SImode);
20488 t2 = gen_reg_rtx (V8SImode);
20489 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20490 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20491 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20492 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20493 return;
20495 case V4SFmode:
20496 t1 = gen_reg_rtx (V8SFmode);
20497 t2 = gen_reg_rtx (V8SImode);
20498 mask = gen_lowpart (V4SImode, mask);
20499 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20500 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20501 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20502 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20503 return;
20505 case V32QImode:
20506 t1 = gen_reg_rtx (V32QImode);
20507 t2 = gen_reg_rtx (V32QImode);
20508 t3 = gen_reg_rtx (V32QImode);
20509 vt2 = GEN_INT (128);
20510 for (i = 0; i < 32; i++)
20511 vec[i] = vt2;
20512 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20513 vt = force_reg (V32QImode, vt);
20514 for (i = 0; i < 32; i++)
20515 vec[i] = i < 16 ? vt2 : const0_rtx;
20516 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20517 vt2 = force_reg (V32QImode, vt2);
20518 /* From mask create two adjusted masks, which contain the same
20519 bits as mask in the low 7 bits of each vector element.
20520 The first mask will have the most significant bit clear
20521 if it requests element from the same 128-bit lane
20522 and MSB set if it requests element from the other 128-bit lane.
20523 The second mask will have the opposite values of the MSB,
20524 and additionally will have its 128-bit lanes swapped.
20525 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20526 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20527 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20528 stands for other 12 bytes. */
20529 /* The bit that says whether an element is from the same lane or the other
20530 lane is bit 4, so shift it up by 3 to the MSB position. */
20531 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20532 gen_lowpart (V4DImode, mask),
20533 GEN_INT (3)));
20534 /* Clear MSB bits from the mask just in case it had them set. */
20535 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20536 /* After this t1 will have MSB set for elements from other lane. */
20537 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20538 /* Clear bits other than MSB. */
20539 emit_insn (gen_andv32qi3 (t1, t1, vt));
20540 /* Or in the lower bits from mask into t3. */
20541 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20542 /* And invert MSB bits in t1, so MSB is set for elements from the same
20543 lane. */
20544 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20545 /* Swap 128-bit lanes in t3. */
20546 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20547 gen_lowpart (V4DImode, t3),
20548 const2_rtx, GEN_INT (3),
20549 const0_rtx, const1_rtx));
20550 /* And or in the lower bits from mask into t1. */
20551 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20552 if (one_operand_shuffle)
20554 /* Each of these shuffles will put 0s in places where
20555 element from the other 128-bit lane is needed, otherwise
20556 will shuffle in the requested value. */
20557 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20558 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20559 /* For t3 the 128-bit lanes are swapped again. */
20560 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20561 gen_lowpart (V4DImode, t3),
20562 const2_rtx, GEN_INT (3),
20563 const0_rtx, const1_rtx));
20564 /* And oring both together leads to the result. */
20565 emit_insn (gen_iorv32qi3 (target, t1, t3));
20566 return;
20569 t4 = gen_reg_rtx (V32QImode);
20570 /* Similar to the one_operand_shuffle code above,
20571 just repeated twice for each operand. The merge_two:
20572 code will merge the two results together. */
20573 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20574 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20575 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20576 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20577 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20578 gen_lowpart (V4DImode, t4),
20579 const2_rtx, GEN_INT (3),
20580 const0_rtx, const1_rtx));
20581 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20582 gen_lowpart (V4DImode, t3),
20583 const2_rtx, GEN_INT (3),
20584 const0_rtx, const1_rtx));
20585 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20586 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20587 t1 = t4;
20588 t2 = t3;
20589 goto merge_two;
20591 default:
20592 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20593 break;
20597 if (TARGET_XOP)
20599 /* The XOP VPPERM insn supports three inputs. By ignoring the
20600 one_operand_shuffle special case, we avoid creating another
20601 set of constant vectors in memory. */
20602 one_operand_shuffle = false;
20604 /* mask = mask & {2*w-1, ...} */
20605 vt = GEN_INT (2*w - 1);
20607 else
20609 /* mask = mask & {w-1, ...} */
20610 vt = GEN_INT (w - 1);
20613 for (i = 0; i < w; i++)
20614 vec[i] = vt;
20615 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20616 mask = expand_simple_binop (maskmode, AND, mask, vt,
20617 NULL_RTX, 0, OPTAB_DIRECT);
20619 /* For non-QImode operations, convert the word permutation control
20620 into a byte permutation control. */
20621 if (mode != V16QImode)
20623 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20624 GEN_INT (exact_log2 (e)),
20625 NULL_RTX, 0, OPTAB_DIRECT);
20627 /* Convert mask to vector of chars. */
20628 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20630 /* Replicate each of the input bytes into byte positions:
20631 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20632 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20633 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20634 for (i = 0; i < 16; ++i)
20635 vec[i] = GEN_INT (i/e * e);
20636 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20637 vt = force_const_mem (V16QImode, vt);
20638 if (TARGET_XOP)
20639 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20640 else
20641 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20643 /* Convert it into the byte positions by doing
20644 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20645 for (i = 0; i < 16; ++i)
20646 vec[i] = GEN_INT (i % e);
20647 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20648 vt = force_const_mem (V16QImode, vt);
20649 emit_insn (gen_addv16qi3 (mask, mask, vt));
20652 /* The actual shuffle operations all operate on V16QImode. */
20653 op0 = gen_lowpart (V16QImode, op0);
20654 op1 = gen_lowpart (V16QImode, op1);
20655 target = gen_lowpart (V16QImode, target);
20657 if (TARGET_XOP)
20659 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20661 else if (one_operand_shuffle)
20663 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20665 else
20667 rtx xops[6];
20668 bool ok;
20670 /* Shuffle the two input vectors independently. */
20671 t1 = gen_reg_rtx (V16QImode);
20672 t2 = gen_reg_rtx (V16QImode);
20673 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20674 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20676 merge_two:
20677 /* Then merge them together. The key is whether any given control
20678 element contained a bit set that indicates the second word. */
20679 mask = operands[3];
20680 vt = GEN_INT (w);
20681 if (maskmode == V2DImode && !TARGET_SSE4_1)
20683 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20684 more shuffle to convert the V2DI input mask into a V4SI
20685 input mask. At that point the masking that expand_int_vcond
20686 performs will work as desired. */
20687 rtx t3 = gen_reg_rtx (V4SImode);
20688 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20689 const0_rtx, const0_rtx,
20690 const2_rtx, const2_rtx));
20691 mask = t3;
20692 maskmode = V4SImode;
20693 e = w = 4;
20696 for (i = 0; i < w; i++)
20697 vec[i] = vt;
20698 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20699 vt = force_reg (maskmode, vt);
20700 mask = expand_simple_binop (maskmode, AND, mask, vt,
20701 NULL_RTX, 0, OPTAB_DIRECT);
20703 xops[0] = gen_lowpart (mode, operands[0]);
20704 xops[1] = gen_lowpart (mode, t2);
20705 xops[2] = gen_lowpart (mode, t1);
20706 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20707 xops[4] = mask;
20708 xops[5] = vt;
20709 ok = ix86_expand_int_vcond (xops);
20710 gcc_assert (ok);
20714 /* Unpack SRC into the next wider integer vector type, storing the result
20715 in DEST. UNSIGNED_P is true if we should do zero extension, else sign
20716 extension. HIGH_P is true if we want the N/2 high elements, else the low elements. */
20718 void
20719 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20721 enum machine_mode imode = GET_MODE (src);
20722 rtx tmp;
20724 if (TARGET_SSE4_1)
20726 rtx (*unpack)(rtx, rtx);
20727 rtx (*extract)(rtx, rtx) = NULL;
20728 enum machine_mode halfmode = BLKmode;
20730 switch (imode)
20732 case V32QImode:
20733 if (unsigned_p)
20734 unpack = gen_avx2_zero_extendv16qiv16hi2;
20735 else
20736 unpack = gen_avx2_sign_extendv16qiv16hi2;
20737 halfmode = V16QImode;
20738 extract
20739 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20740 break;
20741 case V16HImode:
20742 if (unsigned_p)
20743 unpack = gen_avx2_zero_extendv8hiv8si2;
20744 else
20745 unpack = gen_avx2_sign_extendv8hiv8si2;
20746 halfmode = V8HImode;
20747 extract
20748 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20749 break;
20750 case V8SImode:
20751 if (unsigned_p)
20752 unpack = gen_avx2_zero_extendv4siv4di2;
20753 else
20754 unpack = gen_avx2_sign_extendv4siv4di2;
20755 halfmode = V4SImode;
20756 extract
20757 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20758 break;
20759 case V16QImode:
20760 if (unsigned_p)
20761 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20762 else
20763 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20764 break;
20765 case V8HImode:
20766 if (unsigned_p)
20767 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20768 else
20769 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20770 break;
20771 case V4SImode:
20772 if (unsigned_p)
20773 unpack = gen_sse4_1_zero_extendv2siv2di2;
20774 else
20775 unpack = gen_sse4_1_sign_extendv2siv2di2;
20776 break;
20777 default:
20778 gcc_unreachable ();
20781 if (GET_MODE_SIZE (imode) == 32)
20783 tmp = gen_reg_rtx (halfmode);
20784 emit_insn (extract (tmp, src));
20786 else if (high_p)
20788 /* Shift higher 8 bytes to lower 8 bytes. */
20789 tmp = gen_reg_rtx (imode);
20790 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20791 gen_lowpart (V1TImode, src),
20792 GEN_INT (64)));
20794 else
20795 tmp = src;
20797 emit_insn (unpack (dest, tmp));
20799 else
20801 rtx (*unpack)(rtx, rtx, rtx);
20803 switch (imode)
20805 case V16QImode:
20806 if (high_p)
20807 unpack = gen_vec_interleave_highv16qi;
20808 else
20809 unpack = gen_vec_interleave_lowv16qi;
20810 break;
20811 case V8HImode:
20812 if (high_p)
20813 unpack = gen_vec_interleave_highv8hi;
20814 else
20815 unpack = gen_vec_interleave_lowv8hi;
20816 break;
20817 case V4SImode:
20818 if (high_p)
20819 unpack = gen_vec_interleave_highv4si;
20820 else
20821 unpack = gen_vec_interleave_lowv4si;
20822 break;
20823 default:
20824 gcc_unreachable ();
20827 if (unsigned_p)
20828 tmp = force_reg (imode, CONST0_RTX (imode));
20829 else
20830 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20831 src, pc_rtx, pc_rtx);
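/* For the signed case this computes tmp = (0 > src) as a per-element
   mask, i.e. all ones for negative elements and zero otherwise;
   interleaving each element with that mask is exactly sign extension
   to the next wider element on this little-endian layout, while
   interleaving with a zero register gives zero extension.  */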
20833 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20837 /* Expand conditional increment or decrement using adc/sbb instructions.
20838 The default case, using setcc followed by a conditional move, can be
20839 done by generic code. */
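/* A rough sketch of the idea, with illustrative operand names: a
   pattern such as

     x = y + (a < b ? 1 : 0)    (unsigned compare)

   is emitted as a compare that leaves the condition in the carry flag,
   followed by a single add-with-carry of zero, roughly

     cmpl b, a      ; sets CF when a < b (unsigned)
     adcl $0, x     ; x = y + CF, with x holding y beforehand

   The decrement case and reversed conditions use sbb or a reversed
   carry setting in the same way.  */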
20840 bool
20841 ix86_expand_int_addcc (rtx operands[])
20843 enum rtx_code code = GET_CODE (operands[1]);
20844 rtx flags;
20845 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20846 rtx compare_op;
20847 rtx val = const0_rtx;
20848 bool fpcmp = false;
20849 enum machine_mode mode;
20850 rtx op0 = XEXP (operands[1], 0);
20851 rtx op1 = XEXP (operands[1], 1);
20853 if (operands[3] != const1_rtx
20854 && operands[3] != constm1_rtx)
20855 return false;
20856 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20857 return false;
20858 code = GET_CODE (compare_op);
20860 flags = XEXP (compare_op, 0);
20862 if (GET_MODE (flags) == CCFPmode
20863 || GET_MODE (flags) == CCFPUmode)
20865 fpcmp = true;
20866 code = ix86_fp_compare_code_to_integer (code);
20869 if (code != LTU)
20871 val = constm1_rtx;
20872 if (fpcmp)
20873 PUT_CODE (compare_op,
20874 reverse_condition_maybe_unordered
20875 (GET_CODE (compare_op)));
20876 else
20877 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20880 mode = GET_MODE (operands[0]);
20882 /* Construct either adc or sbb insn. */
20883 if ((code == LTU) == (operands[3] == constm1_rtx))
20885 switch (mode)
20887 case QImode:
20888 insn = gen_subqi3_carry;
20889 break;
20890 case HImode:
20891 insn = gen_subhi3_carry;
20892 break;
20893 case SImode:
20894 insn = gen_subsi3_carry;
20895 break;
20896 case DImode:
20897 insn = gen_subdi3_carry;
20898 break;
20899 default:
20900 gcc_unreachable ();
20903 else
20905 switch (mode)
20907 case QImode:
20908 insn = gen_addqi3_carry;
20909 break;
20910 case HImode:
20911 insn = gen_addhi3_carry;
20912 break;
20913 case SImode:
20914 insn = gen_addsi3_carry;
20915 break;
20916 case DImode:
20917 insn = gen_adddi3_carry;
20918 break;
20919 default:
20920 gcc_unreachable ();
20923 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20925 return true;
20929 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20930 but works for floating point parameters and non-offsettable memories.
20931 For pushes, it returns just stack offsets; the values will be saved
20932 in the right order. At most four parts are generated. */
20934 static int
20935 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20937 int size;
20939 if (!TARGET_64BIT)
20940 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20941 else
20942 size = (GET_MODE_SIZE (mode) + 4) / 8;
20944 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20945 gcc_assert (size >= 2 && size <= 4);
20947 /* Optimize constant pool references to immediates. This is used by fp
20948 moves, which force all constants to memory to allow combining. */
20949 if (MEM_P (operand) && MEM_READONLY_P (operand))
20951 rtx tmp = maybe_get_pool_constant (operand);
20952 if (tmp)
20953 operand = tmp;
20956 if (MEM_P (operand) && !offsettable_memref_p (operand))
20958 /* The only non-offsettable memories we handle are pushes. */
20959 int ok = push_operand (operand, VOIDmode);
20961 gcc_assert (ok);
20963 operand = copy_rtx (operand);
20964 PUT_MODE (operand, word_mode);
20965 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20966 return size;
20969 if (GET_CODE (operand) == CONST_VECTOR)
20971 enum machine_mode imode = int_mode_for_mode (mode);
20972 /* Caution: if we looked through a constant pool memory above,
20973 the operand may actually have a different mode now. That's
20974 ok, since we want to pun this all the way back to an integer. */
20975 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20976 gcc_assert (operand != NULL);
20977 mode = imode;
20980 if (!TARGET_64BIT)
20982 if (mode == DImode)
20983 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20984 else
20986 int i;
20988 if (REG_P (operand))
20990 gcc_assert (reload_completed);
20991 for (i = 0; i < size; i++)
20992 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20994 else if (offsettable_memref_p (operand))
20996 operand = adjust_address (operand, SImode, 0);
20997 parts[0] = operand;
20998 for (i = 1; i < size; i++)
20999 parts[i] = adjust_address (operand, SImode, 4 * i);
21001 else if (GET_CODE (operand) == CONST_DOUBLE)
21003 REAL_VALUE_TYPE r;
21004 long l[4];
21006 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21007 switch (mode)
21009 case TFmode:
21010 real_to_target (l, &r, mode);
21011 parts[3] = gen_int_mode (l[3], SImode);
21012 parts[2] = gen_int_mode (l[2], SImode);
21013 break;
21014 case XFmode:
21015 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21016 long double may not be 80-bit. */
21017 real_to_target (l, &r, mode);
21018 parts[2] = gen_int_mode (l[2], SImode);
21019 break;
21020 case DFmode:
21021 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21022 break;
21023 default:
21024 gcc_unreachable ();
21026 parts[1] = gen_int_mode (l[1], SImode);
21027 parts[0] = gen_int_mode (l[0], SImode);
21029 else
21030 gcc_unreachable ();
21033 else
21035 if (mode == TImode)
21036 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21037 if (mode == XFmode || mode == TFmode)
21039 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21040 if (REG_P (operand))
21042 gcc_assert (reload_completed);
21043 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21044 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21046 else if (offsettable_memref_p (operand))
21048 operand = adjust_address (operand, DImode, 0);
21049 parts[0] = operand;
21050 parts[1] = adjust_address (operand, upper_mode, 8);
21052 else if (GET_CODE (operand) == CONST_DOUBLE)
21054 REAL_VALUE_TYPE r;
21055 long l[4];
21057 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21058 real_to_target (l, &r, mode);
21060 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21061 if (HOST_BITS_PER_WIDE_INT >= 64)
21062 parts[0]
21063 = gen_int_mode
21064 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21065 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21066 DImode);
21067 else
21068 parts[0] = immed_double_const (l[0], l[1], DImode);
21070 if (upper_mode == SImode)
21071 parts[1] = gen_int_mode (l[2], SImode);
21072 else if (HOST_BITS_PER_WIDE_INT >= 64)
21073 parts[1]
21074 = gen_int_mode
21075 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21076 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21077 DImode);
21078 else
21079 parts[1] = immed_double_const (l[2], l[3], DImode);
21081 else
21082 gcc_unreachable ();
21086 return size;
21089 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21090 The value is split into word-sized parts; operands 2 onward receive
21091 the destination parts and operands 6 onward the source parts, in the
21092 order in which they are to be moved. */
21094 void
21095 ix86_split_long_move (rtx operands[])
21097 rtx part[2][4];
21098 int nparts, i, j;
21099 int push = 0;
21100 int collisions = 0;
21101 enum machine_mode mode = GET_MODE (operands[0]);
21102 bool collisionparts[4];
21104 /* The DFmode expanders may ask us to move double.
21105 For 64bit target this is single move. By hiding the fact
21106 here we simplify i386.md splitters. */
21107 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21109 /* Optimize constant pool reference to immediates. This is used by
21110 fp moves, that force all constants to memory to allow combining. */
21112 if (MEM_P (operands[1])
21113 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21114 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21115 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21116 if (push_operand (operands[0], VOIDmode))
21118 operands[0] = copy_rtx (operands[0]);
21119 PUT_MODE (operands[0], word_mode);
21121 else
21122 operands[0] = gen_lowpart (DImode, operands[0]);
21123 operands[1] = gen_lowpart (DImode, operands[1]);
21124 emit_move_insn (operands[0], operands[1]);
21125 return;
21128 /* The only non-offsettable memory we handle is push. */
21129 if (push_operand (operands[0], VOIDmode))
21130 push = 1;
21131 else
21132 gcc_assert (!MEM_P (operands[0])
21133 || offsettable_memref_p (operands[0]));
21135 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21136 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21138 /* When emitting push, take care for source operands on the stack. */
21139 if (push && MEM_P (operands[1])
21140 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21142 rtx src_base = XEXP (part[1][nparts - 1], 0);
21144 /* Compensate for the stack decrement by 4. */
21145 if (!TARGET_64BIT && nparts == 3
21146 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21147 src_base = plus_constant (Pmode, src_base, 4);
21149 /* src_base refers to the stack pointer and is
21150 automatically decreased by emitted push. */
21151 for (i = 0; i < nparts; i++)
21152 part[1][i] = change_address (part[1][i],
21153 GET_MODE (part[1][i]), src_base);
21156 /* We need to do copy in the right order in case an address register
21157 of the source overlaps the destination. */
21158 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21160 rtx tmp;
21162 for (i = 0; i < nparts; i++)
21164 collisionparts[i]
21165 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21166 if (collisionparts[i])
21167 collisions++;
21170 /* Collision in the middle part can be handled by reordering. */
21171 if (collisions == 1 && nparts == 3 && collisionparts [1])
21173 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21174 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21176 else if (collisions == 1
21177 && nparts == 4
21178 && (collisionparts [1] || collisionparts [2]))
21180 if (collisionparts [1])
21182 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21183 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21185 else
21187 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21188 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21192 /* If there are more collisions, we can't handle it by reordering.
21193 Do an lea to the last part and use only one colliding move. */
21194 else if (collisions > 1)
21196 rtx base;
21198 collisions = 1;
21200 base = part[0][nparts - 1];
21202 /* Handle the case when the last part isn't valid for lea.
21203 Happens in 64-bit mode storing the 12-byte XFmode. */
21204 if (GET_MODE (base) != Pmode)
21205 base = gen_rtx_REG (Pmode, REGNO (base));
21207 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21208 part[1][0] = replace_equiv_address (part[1][0], base);
21209 for (i = 1; i < nparts; i++)
21211 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21212 part[1][i] = replace_equiv_address (part[1][i], tmp);
21217 if (push)
21219 if (!TARGET_64BIT)
21221 if (nparts == 3)
21223 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21224 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21225 stack_pointer_rtx, GEN_INT (-4)));
21226 emit_move_insn (part[0][2], part[1][2]);
21228 else if (nparts == 4)
21230 emit_move_insn (part[0][3], part[1][3]);
21231 emit_move_insn (part[0][2], part[1][2]);
21234 else
21236 /* In 64bit mode we don't have a 32bit push available. In case this is a
21237 register, it is OK - we will just use the larger counterpart. We also
21238 retype memory - this comes from an attempt to avoid a REX prefix on
21239 moves of the second half of a TFmode value. */
21240 if (GET_MODE (part[1][1]) == SImode)
21242 switch (GET_CODE (part[1][1]))
21244 case MEM:
21245 part[1][1] = adjust_address (part[1][1], DImode, 0);
21246 break;
21248 case REG:
21249 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21250 break;
21252 default:
21253 gcc_unreachable ();
21256 if (GET_MODE (part[1][0]) == SImode)
21257 part[1][0] = part[1][1];
21260 emit_move_insn (part[0][1], part[1][1]);
21261 emit_move_insn (part[0][0], part[1][0]);
21262 return;
21265 /* Choose correct order to not overwrite the source before it is copied. */
21266 if ((REG_P (part[0][0])
21267 && REG_P (part[1][1])
21268 && (REGNO (part[0][0]) == REGNO (part[1][1])
21269 || (nparts == 3
21270 && REGNO (part[0][0]) == REGNO (part[1][2]))
21271 || (nparts == 4
21272 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21273 || (collisions > 0
21274 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21276 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21278 operands[2 + i] = part[0][j];
21279 operands[6 + i] = part[1][j];
21282 else
21284 for (i = 0; i < nparts; i++)
21286 operands[2 + i] = part[0][i];
21287 operands[6 + i] = part[1][i];
21291 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21292 if (optimize_insn_for_size_p ())
21294 for (j = 0; j < nparts - 1; j++)
21295 if (CONST_INT_P (operands[6 + j])
21296 && operands[6 + j] != const0_rtx
21297 && REG_P (operands[2 + j]))
21298 for (i = j; i < nparts - 1; i++)
21299 if (CONST_INT_P (operands[7 + i])
21300 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21301 operands[7 + i] = operands[2 + j];
21304 for (i = 0; i < nparts; i++)
21305 emit_move_insn (operands[2 + i], operands[6 + i]);
21307 return;
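/* E.g. when loading a DImode value from memory into a register pair whose low
   register also holds the memory address, the moves above are emitted in
   high-to-low order (or, with more collisions, an lea first materializes the
   address in the last destination part) so the address is not clobbered
   before its final use.  */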
21310 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21311 left shift by a constant, either using a single shift or
21312 a sequence of add instructions. */
21314 static void
21315 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21317 rtx (*insn)(rtx, rtx, rtx);
21319 if (count == 1
21320 || (count * ix86_cost->add <= ix86_cost->shift_const
21321 && !optimize_insn_for_size_p ()))
21323 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21324 while (count-- > 0)
21325 emit_insn (insn (operand, operand, operand));
21327 else
21329 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21330 emit_insn (insn (operand, operand, GEN_INT (count)));
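/* E.g. a left shift by 1, or by 2 when two adds are cheaper than a constant
   shift and we are not optimizing for size, is emitted as repeated
   "add reg,reg"; otherwise a single shift by an immediate count is used.  */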
21334 void
21335 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21337 rtx (*gen_ashl3)(rtx, rtx, rtx);
21338 rtx (*gen_shld)(rtx, rtx, rtx);
21339 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21341 rtx low[2], high[2];
21342 int count;
21344 if (CONST_INT_P (operands[2]))
21346 split_double_mode (mode, operands, 2, low, high);
21347 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21349 if (count >= half_width)
21351 emit_move_insn (high[0], low[1]);
21352 emit_move_insn (low[0], const0_rtx);
21354 if (count > half_width)
21355 ix86_expand_ashl_const (high[0], count - half_width, mode);
21357 else
21359 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21361 if (!rtx_equal_p (operands[0], operands[1]))
21362 emit_move_insn (operands[0], operands[1]);
21364 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21365 ix86_expand_ashl_const (low[0], count, mode);
21367 return;
21370 split_double_mode (mode, operands, 1, low, high);
21372 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21374 if (operands[1] == const1_rtx)
21376 /* Assuming we've chosen QImode-capable registers, then 1 << N
21377 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21378 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21380 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21382 ix86_expand_clear (low[0]);
21383 ix86_expand_clear (high[0]);
21384 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21386 d = gen_lowpart (QImode, low[0]);
21387 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21388 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21389 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21391 d = gen_lowpart (QImode, high[0]);
21392 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21393 s = gen_rtx_NE (QImode, flags, const0_rtx);
21394 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21397 /* Otherwise, we can get the same results by manually performing
21398 a bit extract operation on bit 5/6, and then performing the two
21399 shifts. The two methods of getting 0/1 into low/high are exactly
21400 the same size. Avoiding the shift in the bit extract case helps
21401 pentium4 a bit; no one else seems to care much either way. */
21402 else
21404 enum machine_mode half_mode;
21405 rtx (*gen_lshr3)(rtx, rtx, rtx);
21406 rtx (*gen_and3)(rtx, rtx, rtx);
21407 rtx (*gen_xor3)(rtx, rtx, rtx);
21408 HOST_WIDE_INT bits;
21409 rtx x;
21411 if (mode == DImode)
21413 half_mode = SImode;
21414 gen_lshr3 = gen_lshrsi3;
21415 gen_and3 = gen_andsi3;
21416 gen_xor3 = gen_xorsi3;
21417 bits = 5;
21419 else
21421 half_mode = DImode;
21422 gen_lshr3 = gen_lshrdi3;
21423 gen_and3 = gen_anddi3;
21424 gen_xor3 = gen_xordi3;
21425 bits = 6;
21428 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21429 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21430 else
21431 x = gen_lowpart (half_mode, operands[2]);
21432 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21434 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21435 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21436 emit_move_insn (low[0], high[0]);
21437 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21440 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21441 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21442 return;
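/* The 1 << N expansion above thus sets one half to 1 by testing bit 5 (or 6)
   of the count and then shifts both halves left by the count; since the
   hardware shift masks the count to the half width, the result is
   low = 1 << N, high = 0 for N < 32 and low = 0, high = 1 << (N - 32)
   otherwise (64-bit halves analogously).  */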
21445 if (operands[1] == constm1_rtx)
21447 /* For -1 << N, we can avoid the shld instruction, because we
21448 know that we're shifting 0...31/63 ones into a -1. */
21449 emit_move_insn (low[0], constm1_rtx);
21450 if (optimize_insn_for_size_p ())
21451 emit_move_insn (high[0], low[0]);
21452 else
21453 emit_move_insn (high[0], constm1_rtx);
21455 else
21457 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21459 if (!rtx_equal_p (operands[0], operands[1]))
21460 emit_move_insn (operands[0], operands[1]);
21462 split_double_mode (mode, operands, 1, low, high);
21463 emit_insn (gen_shld (high[0], low[0], operands[2]));
21466 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21468 if (TARGET_CMOVE && scratch)
21470 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21471 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21473 ix86_expand_clear (scratch);
21474 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21476 else
21478 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21479 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21481 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
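/* For a non-constant count the shld/shift pair above only handles counts below
   half_width; if the count turns out to be larger at run time, the
   x86_shift*_adj pattern moves the low half into the high half and clears the
   low half, using cmov when available and a conditional branch otherwise.  */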
21485 void
21486 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21488 rtx (*gen_ashr3)(rtx, rtx, rtx)
21489 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21490 rtx (*gen_shrd)(rtx, rtx, rtx);
21491 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21493 rtx low[2], high[2];
21494 int count;
21496 if (CONST_INT_P (operands[2]))
21498 split_double_mode (mode, operands, 2, low, high);
21499 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21501 if (count == GET_MODE_BITSIZE (mode) - 1)
21503 emit_move_insn (high[0], high[1]);
21504 emit_insn (gen_ashr3 (high[0], high[0],
21505 GEN_INT (half_width - 1)));
21506 emit_move_insn (low[0], high[0]);
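/* So a DImode arithmetic shift right by 63 on a 32-bit target reduces to
   copying the high word, shifting it right by 31 and duplicating the result
   into the low word, leaving both halves equal to the sign mask.  */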
21509 else if (count >= half_width)
21511 emit_move_insn (low[0], high[1]);
21512 emit_move_insn (high[0], low[0]);
21513 emit_insn (gen_ashr3 (high[0], high[0],
21514 GEN_INT (half_width - 1)));
21516 if (count > half_width)
21517 emit_insn (gen_ashr3 (low[0], low[0],
21518 GEN_INT (count - half_width)));
21520 else
21522 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21524 if (!rtx_equal_p (operands[0], operands[1]))
21525 emit_move_insn (operands[0], operands[1]);
21527 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21528 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21531 else
21533 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21535 if (!rtx_equal_p (operands[0], operands[1]))
21536 emit_move_insn (operands[0], operands[1]);
21538 split_double_mode (mode, operands, 1, low, high);
21540 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21541 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21543 if (TARGET_CMOVE && scratch)
21545 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21546 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21548 emit_move_insn (scratch, high[0]);
21549 emit_insn (gen_ashr3 (scratch, scratch,
21550 GEN_INT (half_width - 1)));
21551 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21552 scratch));
21554 else
21556 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21557 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21559 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21564 void
21565 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21567 rtx (*gen_lshr3)(rtx, rtx, rtx)
21568 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21569 rtx (*gen_shrd)(rtx, rtx, rtx);
21570 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21572 rtx low[2], high[2];
21573 int count;
21575 if (CONST_INT_P (operands[2]))
21577 split_double_mode (mode, operands, 2, low, high);
21578 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21580 if (count >= half_width)
21582 emit_move_insn (low[0], high[1]);
21583 ix86_expand_clear (high[0]);
21585 if (count > half_width)
21586 emit_insn (gen_lshr3 (low[0], low[0],
21587 GEN_INT (count - half_width)));
21589 else
21591 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21593 if (!rtx_equal_p (operands[0], operands[1]))
21594 emit_move_insn (operands[0], operands[1]);
21596 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21597 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21600 else
21602 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21604 if (!rtx_equal_p (operands[0], operands[1]))
21605 emit_move_insn (operands[0], operands[1]);
21607 split_double_mode (mode, operands, 1, low, high);
21609 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21610 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21612 if (TARGET_CMOVE && scratch)
21614 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21615 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21617 ix86_expand_clear (scratch);
21618 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21619 scratch));
21621 else
21623 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21624 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21626 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21631 /* Predict just emitted jump instruction to be taken with probability PROB. */
21632 static void
21633 predict_jump (int prob)
21635 rtx insn = get_last_insn ();
21636 gcc_assert (JUMP_P (insn));
21637 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
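/* REG_BR_PROB_BASE is 10000, so e.g. predict_jump (REG_BR_PROB_BASE * 90 / 100)
   records a 90% probability that the just-emitted jump is taken.  */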
21640 /* Helper function for the string operations below. Test VARIABLE whether
21641 it is aligned to VALUE bytes. If it is, jump to the returned label. */
21642 static rtx
21643 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21645 rtx label = gen_label_rtx ();
21646 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21647 if (GET_MODE (variable) == DImode)
21648 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21649 else
21650 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21651 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21652 1, label);
21653 if (epilogue)
21654 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21655 else
21656 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21657 return label;
21660 /* Adjust COUNTER by the VALUE. */
21661 static void
21662 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21664 rtx (*gen_add)(rtx, rtx, rtx)
21665 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21667 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21670 /* Zero extend possibly SImode EXP to Pmode register. */
21672 ix86_zero_extend_to_Pmode (rtx exp)
21674 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21677 /* Divide COUNTREG by SCALE. */
21678 static rtx
21679 scale_counter (rtx countreg, int scale)
21681 rtx sc;
21683 if (scale == 1)
21684 return countreg;
21685 if (CONST_INT_P (countreg))
21686 return GEN_INT (INTVAL (countreg) / scale);
21687 gcc_assert (REG_P (countreg));
21689 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21690 GEN_INT (exact_log2 (scale)),
21691 NULL, 1, OPTAB_DIRECT);
21692 return sc;
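/* E.g. a constant byte count of 37 with SImode chunks (scale 4) gives a
   rep count of 9; a run-time count is instead shifted right by log2 (scale).  */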
21695 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21696 DImode for constant loop counts. */
21698 static enum machine_mode
21699 counter_mode (rtx count_exp)
21701 if (GET_MODE (count_exp) != VOIDmode)
21702 return GET_MODE (count_exp);
21703 if (!CONST_INT_P (count_exp))
21704 return Pmode;
21705 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21706 return DImode;
21707 return SImode;
21710 /* When SRCPTR is non-NULL, output a simple loop to move memory
21711 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21712 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21713 the equivalent loop to set memory to VALUE (supposed to be in MODE).
21715 The size is rounded down to a whole number of chunks moved at once.
21716 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
21719 static void
21720 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21721 rtx destptr, rtx srcptr, rtx value,
21722 rtx count, enum machine_mode mode, int unroll,
21723 int expected_size)
21725 rtx out_label, top_label, iter, tmp;
21726 enum machine_mode iter_mode = counter_mode (count);
21727 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21728 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21729 rtx size;
21730 rtx x_addr;
21731 rtx y_addr;
21732 int i;
21734 top_label = gen_label_rtx ();
21735 out_label = gen_label_rtx ();
21736 iter = gen_reg_rtx (iter_mode);
21738 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21739 NULL, 1, OPTAB_DIRECT);
21740 /* Those two should combine. */
21741 if (piece_size == const1_rtx)
21743 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21744 true, out_label);
21745 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21747 emit_move_insn (iter, const0_rtx);
21749 emit_label (top_label);
21751 tmp = convert_modes (Pmode, iter_mode, iter, true);
21752 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21753 destmem = change_address (destmem, mode, x_addr);
21755 if (srcmem)
21757 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21758 srcmem = change_address (srcmem, mode, y_addr);
21760 /* When unrolling for chips that reorder memory reads and writes,
21761 we can save registers by using single temporary.
21762 Also using 4 temporaries is overkill in 32bit mode. */
21763 if (!TARGET_64BIT && 0)
21765 for (i = 0; i < unroll; i++)
21767 if (i)
21769 destmem =
21770 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21771 srcmem =
21772 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21774 emit_move_insn (destmem, srcmem);
21777 else
21779 rtx tmpreg[4];
21780 gcc_assert (unroll <= 4);
21781 for (i = 0; i < unroll; i++)
21783 tmpreg[i] = gen_reg_rtx (mode);
21784 if (i)
21786 srcmem =
21787 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21789 emit_move_insn (tmpreg[i], srcmem);
21791 for (i = 0; i < unroll; i++)
21793 if (i)
21795 destmem =
21796 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21798 emit_move_insn (destmem, tmpreg[i]);
21802 else
21803 for (i = 0; i < unroll; i++)
21805 if (i)
21806 destmem =
21807 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21808 emit_move_insn (destmem, value);
21811 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21812 true, OPTAB_LIB_WIDEN);
21813 if (tmp != iter)
21814 emit_move_insn (iter, tmp);
21816 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21817 true, top_label);
21818 if (expected_size != -1)
21820 expected_size /= GET_MODE_SIZE (mode) * unroll;
21821 if (expected_size == 0)
21822 predict_jump (0);
21823 else if (expected_size > REG_BR_PROB_BASE)
21824 predict_jump (REG_BR_PROB_BASE - 1);
21825 else
21826 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21828 else
21829 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21830 iter = ix86_zero_extend_to_Pmode (iter);
21831 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21832 true, OPTAB_LIB_WIDEN);
21833 if (tmp != destptr)
21834 emit_move_insn (destptr, tmp);
21835 if (srcptr)
21837 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21838 true, OPTAB_LIB_WIDEN);
21839 if (tmp != srcptr)
21840 emit_move_insn (srcptr, tmp);
21842 emit_label (out_label);
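/* Roughly, for the memcpy case the RTL emitted above has the shape of this
   C sketch (illustrative only; the tail that does not fill a whole chunk is
   left to the separate epilogue expanders):

     size_t chunk = GET_MODE_SIZE (mode) * unroll;
     size_t size = count & ~(chunk - 1);        // round down to whole chunks
     for (size_t iter = 0; iter < size; iter += chunk)
       memcpy (dst + iter, src + iter, chunk);  // the (unrolled) loop body
     dst += size; src += size;                  // pointers advanced at the end

   with an extra size == 0 bypass emitted when the chunk size is one byte.  */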
21845 /* Output "rep; mov" instruction.
21846 Arguments have same meaning as for previous function */
21847 static void
21848 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21849 rtx destptr, rtx srcptr,
21850 rtx count,
21851 enum machine_mode mode)
21853 rtx destexp;
21854 rtx srcexp;
21855 rtx countreg;
21856 HOST_WIDE_INT rounded_count;
21858 /* If the size is known, it is shorter to use rep movs. */
21859 if (mode == QImode && CONST_INT_P (count)
21860 && !(INTVAL (count) & 3))
21861 mode = SImode;
21863 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21864 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21865 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21866 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21867 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21868 if (mode != QImode)
21870 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21871 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21872 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21873 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21874 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21875 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21877 else
21879 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21880 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21882 if (CONST_INT_P (count))
21884 rounded_count = (INTVAL (count)
21885 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21886 destmem = shallow_copy_rtx (destmem);
21887 srcmem = shallow_copy_rtx (srcmem);
21888 set_mem_size (destmem, rounded_count);
21889 set_mem_size (srcmem, rounded_count);
21891 else
21893 if (MEM_SIZE_KNOWN_P (destmem))
21894 clear_mem_size (destmem);
21895 if (MEM_SIZE_KNOWN_P (srcmem))
21896 clear_mem_size (srcmem);
21898 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21899 destexp, srcexp));
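/* Note: DESTEXP and SRCEXP describe the values the pointer registers are
   expected to hold after the rep move (roughly destptr + countreg * chunk
   size); the rep_mov pattern uses them to expose those pointer updates to
   the rest of the compiler.  */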
21902 /* Output "rep; stos" instruction.
21903 Arguments have same meaning as for previous function */
21904 static void
21905 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21906 rtx count, enum machine_mode mode,
21907 rtx orig_value)
21909 rtx destexp;
21910 rtx countreg;
21911 HOST_WIDE_INT rounded_count;
21913 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21914 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21915 value = force_reg (mode, gen_lowpart (mode, value));
21916 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21917 if (mode != QImode)
21919 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21920 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21921 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21923 else
21924 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21925 if (orig_value == const0_rtx && CONST_INT_P (count))
21927 rounded_count = (INTVAL (count)
21928 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21929 destmem = shallow_copy_rtx (destmem);
21930 set_mem_size (destmem, rounded_count);
21932 else if (MEM_SIZE_KNOWN_P (destmem))
21933 clear_mem_size (destmem);
21934 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21937 static void
21938 emit_strmov (rtx destmem, rtx srcmem,
21939 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21941 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21942 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21943 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21946 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21947 static void
21948 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21949 rtx destptr, rtx srcptr, rtx count, int max_size)
21951 rtx src, dest;
21952 if (CONST_INT_P (count))
21954 HOST_WIDE_INT countval = INTVAL (count);
21955 int offset = 0;
21957 if ((countval & 0x10) && max_size > 16)
21959 if (TARGET_64BIT)
21961 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21962 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21964 else
21965 gcc_unreachable ();
21966 offset += 16;
21968 if ((countval & 0x08) && max_size > 8)
21970 if (TARGET_64BIT)
21971 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21972 else
21974 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21975 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21977 offset += 8;
21979 if ((countval & 0x04) && max_size > 4)
21981 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21982 offset += 4;
21984 if ((countval & 0x02) && max_size > 2)
21986 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21987 offset += 2;
21989 if ((countval & 0x01) && max_size > 1)
21991 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21992 offset += 1;
21994 return;
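/* E.g. a constant count of 13 on a 64-bit target emits one DImode, one SImode
   and one QImode string move (8 + 4 + 1 bytes) at increasing offsets, with no
   loop at all.  */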
21996 if (max_size > 8)
21998 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21999 count, 1, OPTAB_DIRECT);
22000 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22001 count, QImode, 1, 4);
22002 return;
22005 /* When there are stringops, we can cheaply increase dest and src pointers.
22006 Otherwise we save code size by maintaining offset (zero is readily
22007 available from preceding rep operation) and using x86 addressing modes. */
22009 if (TARGET_SINGLE_STRINGOP)
22011 if (max_size > 4)
22013 rtx label = ix86_expand_aligntest (count, 4, true);
22014 src = change_address (srcmem, SImode, srcptr);
22015 dest = change_address (destmem, SImode, destptr);
22016 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22017 emit_label (label);
22018 LABEL_NUSES (label) = 1;
22020 if (max_size > 2)
22022 rtx label = ix86_expand_aligntest (count, 2, true);
22023 src = change_address (srcmem, HImode, srcptr);
22024 dest = change_address (destmem, HImode, destptr);
22025 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22026 emit_label (label);
22027 LABEL_NUSES (label) = 1;
22029 if (max_size > 1)
22031 rtx label = ix86_expand_aligntest (count, 1, true);
22032 src = change_address (srcmem, QImode, srcptr);
22033 dest = change_address (destmem, QImode, destptr);
22034 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22035 emit_label (label);
22036 LABEL_NUSES (label) = 1;
22039 else
22041 rtx offset = force_reg (Pmode, const0_rtx);
22042 rtx tmp;
22044 if (max_size > 4)
22046 rtx label = ix86_expand_aligntest (count, 4, true);
22047 src = change_address (srcmem, SImode, srcptr);
22048 dest = change_address (destmem, SImode, destptr);
22049 emit_move_insn (dest, src);
22050 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22051 true, OPTAB_LIB_WIDEN);
22052 if (tmp != offset)
22053 emit_move_insn (offset, tmp);
22054 emit_label (label);
22055 LABEL_NUSES (label) = 1;
22057 if (max_size > 2)
22059 rtx label = ix86_expand_aligntest (count, 2, true);
22060 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22061 src = change_address (srcmem, HImode, tmp);
22062 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22063 dest = change_address (destmem, HImode, tmp);
22064 emit_move_insn (dest, src);
22065 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22066 true, OPTAB_LIB_WIDEN);
22067 if (tmp != offset)
22068 emit_move_insn (offset, tmp);
22069 emit_label (label);
22070 LABEL_NUSES (label) = 1;
22072 if (max_size > 1)
22074 rtx label = ix86_expand_aligntest (count, 1, true);
22075 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22076 src = change_address (srcmem, QImode, tmp);
22077 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22078 dest = change_address (destmem, QImode, tmp);
22079 emit_move_insn (dest, src);
22080 emit_label (label);
22081 LABEL_NUSES (label) = 1;
22086 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22087 static void
22088 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22089 rtx count, int max_size)
22091 count =
22092 expand_simple_binop (counter_mode (count), AND, count,
22093 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22094 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22095 gen_lowpart (QImode, value), count, QImode,
22096 1, max_size / 2);
22099 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22100 static void
22101 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22103 rtx dest;
22105 if (CONST_INT_P (count))
22107 HOST_WIDE_INT countval = INTVAL (count);
22108 int offset = 0;
22110 if ((countval & 0x10) && max_size > 16)
22112 if (TARGET_64BIT)
22114 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22115 emit_insn (gen_strset (destptr, dest, value));
22116 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22117 emit_insn (gen_strset (destptr, dest, value));
22119 else
22120 gcc_unreachable ();
22121 offset += 16;
22123 if ((countval & 0x08) && max_size > 8)
22125 if (TARGET_64BIT)
22127 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22128 emit_insn (gen_strset (destptr, dest, value));
22130 else
22132 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22133 emit_insn (gen_strset (destptr, dest, value));
22134 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22135 emit_insn (gen_strset (destptr, dest, value));
22137 offset += 8;
22139 if ((countval & 0x04) && max_size > 4)
22141 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22142 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22143 offset += 4;
22145 if ((countval & 0x02) && max_size > 2)
22147 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22148 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22149 offset += 2;
22151 if ((countval & 0x01) && max_size > 1)
22153 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22154 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22155 offset += 1;
22157 return;
22159 if (max_size > 32)
22161 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22162 return;
22164 if (max_size > 16)
22166 rtx label = ix86_expand_aligntest (count, 16, true);
22167 if (TARGET_64BIT)
22169 dest = change_address (destmem, DImode, destptr);
22170 emit_insn (gen_strset (destptr, dest, value));
22171 emit_insn (gen_strset (destptr, dest, value));
22173 else
22175 dest = change_address (destmem, SImode, destptr);
22176 emit_insn (gen_strset (destptr, dest, value));
22177 emit_insn (gen_strset (destptr, dest, value));
22178 emit_insn (gen_strset (destptr, dest, value));
22179 emit_insn (gen_strset (destptr, dest, value));
22181 emit_label (label);
22182 LABEL_NUSES (label) = 1;
22184 if (max_size > 8)
22186 rtx label = ix86_expand_aligntest (count, 8, true);
22187 if (TARGET_64BIT)
22189 dest = change_address (destmem, DImode, destptr);
22190 emit_insn (gen_strset (destptr, dest, value));
22192 else
22194 dest = change_address (destmem, SImode, destptr);
22195 emit_insn (gen_strset (destptr, dest, value));
22196 emit_insn (gen_strset (destptr, dest, value));
22198 emit_label (label);
22199 LABEL_NUSES (label) = 1;
22201 if (max_size > 4)
22203 rtx label = ix86_expand_aligntest (count, 4, true);
22204 dest = change_address (destmem, SImode, destptr);
22205 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22206 emit_label (label);
22207 LABEL_NUSES (label) = 1;
22209 if (max_size > 2)
22211 rtx label = ix86_expand_aligntest (count, 2, true);
22212 dest = change_address (destmem, HImode, destptr);
22213 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22214 emit_label (label);
22215 LABEL_NUSES (label) = 1;
22217 if (max_size > 1)
22219 rtx label = ix86_expand_aligntest (count, 1, true);
22220 dest = change_address (destmem, QImode, destptr);
22221 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22222 emit_label (label);
22223 LABEL_NUSES (label) = 1;
22227 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN, to
22228 DESIRED_ALIGNMENT. */
22229 static void
22230 expand_movmem_prologue (rtx destmem, rtx srcmem,
22231 rtx destptr, rtx srcptr, rtx count,
22232 int align, int desired_alignment)
22234 if (align <= 1 && desired_alignment > 1)
22236 rtx label = ix86_expand_aligntest (destptr, 1, false);
22237 srcmem = change_address (srcmem, QImode, srcptr);
22238 destmem = change_address (destmem, QImode, destptr);
22239 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22240 ix86_adjust_counter (count, 1);
22241 emit_label (label);
22242 LABEL_NUSES (label) = 1;
22244 if (align <= 2 && desired_alignment > 2)
22246 rtx label = ix86_expand_aligntest (destptr, 2, false);
22247 srcmem = change_address (srcmem, HImode, srcptr);
22248 destmem = change_address (destmem, HImode, destptr);
22249 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22250 ix86_adjust_counter (count, 2);
22251 emit_label (label);
22252 LABEL_NUSES (label) = 1;
22254 if (align <= 4 && desired_alignment > 4)
22256 rtx label = ix86_expand_aligntest (destptr, 4, false);
22257 srcmem = change_address (srcmem, SImode, srcptr);
22258 destmem = change_address (destmem, SImode, destptr);
22259 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22260 ix86_adjust_counter (count, 4);
22261 emit_label (label);
22262 LABEL_NUSES (label) = 1;
22264 gcc_assert (desired_alignment <= 8);
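/* E.g. with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this emits three conditional
   copies (QImode, HImode, SImode), each guarded by an alignment test on
   DESTPTR and each followed by the matching adjustment of COUNT.  */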
22267 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22268 ALIGN_BYTES is how many bytes need to be copied. */
22269 static rtx
22270 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22271 int desired_align, int align_bytes)
22273 rtx src = *srcp;
22274 rtx orig_dst = dst;
22275 rtx orig_src = src;
22276 int off = 0;
22277 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22278 if (src_align_bytes >= 0)
22279 src_align_bytes = desired_align - src_align_bytes;
22280 if (align_bytes & 1)
22282 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22283 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22284 off = 1;
22285 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22287 if (align_bytes & 2)
22289 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22290 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22291 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22292 set_mem_align (dst, 2 * BITS_PER_UNIT);
22293 if (src_align_bytes >= 0
22294 && (src_align_bytes & 1) == (align_bytes & 1)
22295 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22296 set_mem_align (src, 2 * BITS_PER_UNIT);
22297 off = 2;
22298 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22300 if (align_bytes & 4)
22302 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22303 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22304 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22305 set_mem_align (dst, 4 * BITS_PER_UNIT);
22306 if (src_align_bytes >= 0)
22308 unsigned int src_align = 0;
22309 if ((src_align_bytes & 3) == (align_bytes & 3))
22310 src_align = 4;
22311 else if ((src_align_bytes & 1) == (align_bytes & 1))
22312 src_align = 2;
22313 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22314 set_mem_align (src, src_align * BITS_PER_UNIT);
22316 off = 4;
22317 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22319 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22320 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22321 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22322 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22323 if (src_align_bytes >= 0)
22325 unsigned int src_align = 0;
22326 if ((src_align_bytes & 7) == (align_bytes & 7))
22327 src_align = 8;
22328 else if ((src_align_bytes & 3) == (align_bytes & 3))
22329 src_align = 4;
22330 else if ((src_align_bytes & 1) == (align_bytes & 1))
22331 src_align = 2;
22332 if (src_align > (unsigned int) desired_align)
22333 src_align = desired_align;
22334 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22335 set_mem_align (src, src_align * BITS_PER_UNIT);
22337 if (MEM_SIZE_KNOWN_P (orig_dst))
22338 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22339 if (MEM_SIZE_KNOWN_P (orig_src))
22340 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22341 *srcp = src;
22342 return dst;
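/* E.g. with align_bytes == 3 the code above emits a QImode and then an HImode
   string move and updates the MEM alignment and size attributes, so later
   passes keep accurate aliasing information for the now-aligned remainder.  */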
22345 /* Set enough of DEST to align DEST, known to be aligned by ALIGN, to
22346 DESIRED_ALIGNMENT. */
22347 static void
22348 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22349 int align, int desired_alignment)
22351 if (align <= 1 && desired_alignment > 1)
22353 rtx label = ix86_expand_aligntest (destptr, 1, false);
22354 destmem = change_address (destmem, QImode, destptr);
22355 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22356 ix86_adjust_counter (count, 1);
22357 emit_label (label);
22358 LABEL_NUSES (label) = 1;
22360 if (align <= 2 && desired_alignment > 2)
22362 rtx label = ix86_expand_aligntest (destptr, 2, false);
22363 destmem = change_address (destmem, HImode, destptr);
22364 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22365 ix86_adjust_counter (count, 2);
22366 emit_label (label);
22367 LABEL_NUSES (label) = 1;
22369 if (align <= 4 && desired_alignment > 4)
22371 rtx label = ix86_expand_aligntest (destptr, 4, false);
22372 destmem = change_address (destmem, SImode, destptr);
22373 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22374 ix86_adjust_counter (count, 4);
22375 emit_label (label);
22376 LABEL_NUSES (label) = 1;
22378 gcc_assert (desired_alignment <= 8);
22381 /* Set enough of DST to align DST, known to be aligned by ALIGN, to
22382 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22383 static rtx
22384 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22385 int desired_align, int align_bytes)
22387 int off = 0;
22388 rtx orig_dst = dst;
22389 if (align_bytes & 1)
22391 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22392 off = 1;
22393 emit_insn (gen_strset (destreg, dst,
22394 gen_lowpart (QImode, value)));
22396 if (align_bytes & 2)
22398 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22399 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22400 set_mem_align (dst, 2 * BITS_PER_UNIT);
22401 off = 2;
22402 emit_insn (gen_strset (destreg, dst,
22403 gen_lowpart (HImode, value)));
22405 if (align_bytes & 4)
22407 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22408 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22409 set_mem_align (dst, 4 * BITS_PER_UNIT);
22410 off = 4;
22411 emit_insn (gen_strset (destreg, dst,
22412 gen_lowpart (SImode, value)));
22414 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22415 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22416 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22417 if (MEM_SIZE_KNOWN_P (orig_dst))
22418 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22419 return dst;
22422 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22423 static enum stringop_alg
22424 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22425 int *dynamic_check, bool *noalign)
22427 const struct stringop_algs * algs;
22428 bool optimize_for_speed;
22429 /* Algorithms using the rep prefix want at least edi and ecx;
22430 additionally, memset wants eax and memcpy wants esi. Don't
22431 consider such algorithms if the user has appropriated those
22432 registers for their own purposes. */
22433 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22434 || (memset
22435 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22436 *noalign = false;
22438 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22439 || (alg != rep_prefix_1_byte \
22440 && alg != rep_prefix_4_byte \
22441 && alg != rep_prefix_8_byte))
22442 const struct processor_costs *cost;
22444 /* Even if the string operation call is cold, we still might spend a lot
22445 of time processing large blocks. */
22446 if (optimize_function_for_size_p (cfun)
22447 || (optimize_insn_for_size_p ()
22448 && expected_size != -1 && expected_size < 256))
22449 optimize_for_speed = false;
22450 else
22451 optimize_for_speed = true;
22453 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22455 *dynamic_check = -1;
22456 if (memset)
22457 algs = &cost->memset[TARGET_64BIT != 0];
22458 else
22459 algs = &cost->memcpy[TARGET_64BIT != 0];
22460 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22461 return ix86_stringop_alg;
22462 /* rep; movq or rep; movl is the smallest variant. */
22463 else if (!optimize_for_speed)
22465 if (!count || (count & 3))
22466 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22467 else
22468 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22470 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
22472 else if (expected_size != -1 && expected_size < 4)
22473 return loop_1_byte;
22474 else if (expected_size != -1)
22476 unsigned int i;
22477 enum stringop_alg alg = libcall;
22478 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22480 /* We get here if the algorithms that were not libcall-based
22481 were rep-prefix based and we are unable to use rep prefixes
22482 based on global register usage. Break out of the loop and
22483 use the heuristic below. */
22484 if (algs->size[i].max == 0)
22485 break;
22486 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22488 enum stringop_alg candidate = algs->size[i].alg;
22490 if (candidate != libcall && ALG_USABLE_P (candidate))
22491 alg = candidate;
22492 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22493 last non-libcall inline algorithm. */
22494 if (TARGET_INLINE_ALL_STRINGOPS)
22496 /* When the current size is best copied by a libcall,
22497 but we are still forced to inline, run the heuristic below
22498 that will pick code for medium-sized blocks. */
22499 if (alg != libcall)
22500 return alg;
22501 break;
22503 else if (ALG_USABLE_P (candidate))
22505 *noalign = algs->size[i].noalign;
22506 return candidate;
22510 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22512 /* When asked to inline the call anyway, try to pick a meaningful choice.
22513 We look for the maximal block size that is faster to copy by hand and
22514 take blocks of at most that size, guessing that the average size will
22515 be roughly half of the block.
22517 If this turns out to be bad, we might simply specify the preferred
22518 choice in ix86_costs. */
22519 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22520 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22522 int max = -1;
22523 enum stringop_alg alg;
22524 int i;
22525 bool any_alg_usable_p = true;
22527 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22529 enum stringop_alg candidate = algs->size[i].alg;
22530 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22532 if (candidate != libcall && candidate
22533 && ALG_USABLE_P (candidate))
22534 max = algs->size[i].max;
22536 /* If there aren't any usable algorithms, then recursing on
22537 smaller sizes isn't going to find anything. Just return the
22538 simple byte-at-a-time copy loop. */
22539 if (!any_alg_usable_p)
22541 /* Pick something reasonable. */
22542 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22543 *dynamic_check = 128;
22544 return loop_1_byte;
22546 if (max == -1)
22547 max = 4096;
22548 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22549 gcc_assert (*dynamic_check == -1);
22550 gcc_assert (alg != libcall);
22551 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22552 *dynamic_check = max;
22553 return alg;
22555 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22556 #undef ALG_USABLE_P
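/* As a purely hypothetical illustration (the real numbers live in the per-CPU
   cost tables): if the active table mapped sizes up to 128 bytes to
   unrolled_loop and everything larger to libcall, a known 64-byte copy would
   pick unrolled_loop above, while a larger or unknown size would either fall
   back to the libcall or, under -minline-stringops-dynamically, recurse with
   half the maximal profitable size and request a runtime size check via
   *DYNAMIC_CHECK.  */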
22559 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22560 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22561 static int
22562 decide_alignment (int align,
22563 enum stringop_alg alg,
22564 int expected_size)
22566 int desired_align = 0;
22567 switch (alg)
22569 case no_stringop:
22570 gcc_unreachable ();
22571 case loop:
22572 case unrolled_loop:
22573 desired_align = GET_MODE_SIZE (Pmode);
22574 break;
22575 case rep_prefix_8_byte:
22576 desired_align = 8;
22577 break;
22578 case rep_prefix_4_byte:
22579 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22580 copying a whole cache line at once. */
22581 if (TARGET_PENTIUMPRO)
22582 desired_align = 8;
22583 else
22584 desired_align = 4;
22585 break;
22586 case rep_prefix_1_byte:
22587 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22588 copying a whole cache line at once. */
22589 if (TARGET_PENTIUMPRO)
22590 desired_align = 8;
22591 else
22592 desired_align = 1;
22593 break;
22594 case loop_1_byte:
22595 desired_align = 1;
22596 break;
22597 case libcall:
22598 return 0;
22601 if (optimize_size)
22602 desired_align = 1;
22603 if (desired_align < align)
22604 desired_align = align;
22605 if (expected_size != -1 && expected_size < 4)
22606 desired_align = align;
22607 return desired_align;
22610 /* Return the smallest power of 2 greater than VAL. */
22611 static int
22612 smallest_pow2_greater_than (int val)
22614 int ret = 1;
22615 while (ret <= val)
22616 ret <<= 1;
22617 return ret;
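/* E.g. 5 -> 8 and 8 -> 16; the result is strictly greater than VAL even when
   VAL is already a power of two.  */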
22620 /* Expand string move (memcpy) operation. Use i386 string operations
22621 when profitable. expand_setmem contains similar code. The code
22622 depends upon architecture, block size and alignment, but always has
22623 the same overall structure:
22625 1) Prologue guard: Conditional that jumps up to epilogues for small
22626 blocks that can be handled by epilogue alone. This is faster
22627 but also needed for correctness, since the prologue assumes the block
22628 is larger than the desired alignment.
22630 Optional dynamic check for size and libcall for large
22631 blocks is emitted here too, with -minline-stringops-dynamically.
22633 2) Prologue: copy first few bytes in order to get destination
22634 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22635 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22636 copied. We emit either a jump tree on power of two sized
22637 blocks, or a byte loop.
22639 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22640 with specified algorithm.
22642 4) Epilogue: code copying tail of the block that is too small to be
22643 handled by main body (or up to size guarded by prologue guard). */
22645 bool
22646 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22647 rtx expected_align_exp, rtx expected_size_exp)
22649 rtx destreg;
22650 rtx srcreg;
22651 rtx label = NULL;
22652 rtx tmp;
22653 rtx jump_around_label = NULL;
22654 HOST_WIDE_INT align = 1;
22655 unsigned HOST_WIDE_INT count = 0;
22656 HOST_WIDE_INT expected_size = -1;
22657 int size_needed = 0, epilogue_size_needed;
22658 int desired_align = 0, align_bytes = 0;
22659 enum stringop_alg alg;
22660 int dynamic_check;
22661 bool need_zero_guard = false;
22662 bool noalign;
22664 if (CONST_INT_P (align_exp))
22665 align = INTVAL (align_exp);
22666 /* i386 can do misaligned access at a reasonably increased cost. */
22667 if (CONST_INT_P (expected_align_exp)
22668 && INTVAL (expected_align_exp) > align)
22669 align = INTVAL (expected_align_exp);
22670 /* ALIGN is the minimum of destination and source alignment, but we care here
22671 just about destination alignment. */
22672 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22673 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22675 if (CONST_INT_P (count_exp))
22676 count = expected_size = INTVAL (count_exp);
22677 if (CONST_INT_P (expected_size_exp) && count == 0)
22678 expected_size = INTVAL (expected_size_exp);
22680 /* Make sure we don't need to care about overflow later on. */
22681 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22682 return false;
22684 /* Step 0: Decide on preferred algorithm, desired alignment and
22685 size of chunks to be copied by main loop. */
22687 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22688 desired_align = decide_alignment (align, alg, expected_size);
22690 if (!TARGET_ALIGN_STRINGOPS || noalign)
22691 align = desired_align;
22693 if (alg == libcall)
22694 return false;
22695 gcc_assert (alg != no_stringop);
22696 if (!count)
22697 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22698 destreg = copy_addr_to_reg (XEXP (dst, 0));
22699 srcreg = copy_addr_to_reg (XEXP (src, 0));
22700 switch (alg)
22702 case libcall:
22703 case no_stringop:
22704 gcc_unreachable ();
22705 case loop:
22706 need_zero_guard = true;
22707 size_needed = GET_MODE_SIZE (word_mode);
22708 break;
22709 case unrolled_loop:
22710 need_zero_guard = true;
22711 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22712 break;
22713 case rep_prefix_8_byte:
22714 size_needed = 8;
22715 break;
22716 case rep_prefix_4_byte:
22717 size_needed = 4;
22718 break;
22719 case rep_prefix_1_byte:
22720 size_needed = 1;
22721 break;
22722 case loop_1_byte:
22723 need_zero_guard = true;
22724 size_needed = 1;
22725 break;
22728 epilogue_size_needed = size_needed;
22730 /* Step 1: Prologue guard. */
22732 /* Alignment code needs count to be in register. */
22733 if (CONST_INT_P (count_exp) && desired_align > align)
22735 if (INTVAL (count_exp) > desired_align
22736 && INTVAL (count_exp) > size_needed)
22738 align_bytes
22739 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22740 if (align_bytes <= 0)
22741 align_bytes = 0;
22742 else
22743 align_bytes = desired_align - align_bytes;
22745 if (align_bytes == 0)
22746 count_exp = force_reg (counter_mode (count_exp), count_exp);
22748 gcc_assert (desired_align >= 1 && align >= 1);
22750 /* Ensure that alignment prologue won't copy past end of block. */
22751 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22753 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22754 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
22755 Make sure it is power of 2. */
22756 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22758 if (count)
22760 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22762 /* If main algorithm works on QImode, no epilogue is needed.
22763 For small sizes just don't align anything. */
22764 if (size_needed == 1)
22765 desired_align = align;
22766 else
22767 goto epilogue;
22770 else
22772 label = gen_label_rtx ();
22773 emit_cmp_and_jump_insns (count_exp,
22774 GEN_INT (epilogue_size_needed),
22775 LTU, 0, counter_mode (count_exp), 1, label);
22776 if (expected_size == -1 || expected_size < epilogue_size_needed)
22777 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22778 else
22779 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22783 /* Emit code to decide at runtime whether a library call or inline code should be
22784 used. */
22785 if (dynamic_check != -1)
22787 if (CONST_INT_P (count_exp))
22789 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22791 emit_block_move_via_libcall (dst, src, count_exp, false);
22792 count_exp = const0_rtx;
22793 goto epilogue;
22796 else
22798 rtx hot_label = gen_label_rtx ();
22799 jump_around_label = gen_label_rtx ();
22800 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22801 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22802 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22803 emit_block_move_via_libcall (dst, src, count_exp, false);
22804 emit_jump (jump_around_label);
22805 emit_label (hot_label);
22809 /* Step 2: Alignment prologue. */
22811 if (desired_align > align)
22813 if (align_bytes == 0)
22815 /* Except for the first move in the epilogue, we no longer know the
22816 constant offset in the aliasing info. It doesn't seem worth
22817 the pain to maintain it for the first move, so throw away
22818 the info early. */
22819 src = change_address (src, BLKmode, srcreg);
22820 dst = change_address (dst, BLKmode, destreg);
22821 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22822 desired_align);
22824 else
22826 /* If we know how many bytes need to be stored before dst is
22827 sufficiently aligned, maintain aliasing info accurately. */
22828 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22829 desired_align, align_bytes);
22830 count_exp = plus_constant (counter_mode (count_exp),
22831 count_exp, -align_bytes);
22832 count -= align_bytes;
22834 if (need_zero_guard
22835 && (count < (unsigned HOST_WIDE_INT) size_needed
22836 || (align_bytes == 0
22837 && count < ((unsigned HOST_WIDE_INT) size_needed
22838 + desired_align - align))))
22840 /* It is possible that we copied enough so the main loop will not
22841 execute. */
22842 gcc_assert (size_needed > 1);
22843 if (label == NULL_RTX)
22844 label = gen_label_rtx ();
22845 emit_cmp_and_jump_insns (count_exp,
22846 GEN_INT (size_needed),
22847 LTU, 0, counter_mode (count_exp), 1, label);
22848 if (expected_size == -1
22849 || expected_size < (desired_align - align) / 2 + size_needed)
22850 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22851 else
22852 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22855 if (label && size_needed == 1)
22857 emit_label (label);
22858 LABEL_NUSES (label) = 1;
22859 label = NULL;
22860 epilogue_size_needed = 1;
22862 else if (label == NULL_RTX)
22863 epilogue_size_needed = size_needed;
22865 /* Step 3: Main loop. */
22867 switch (alg)
22869 case libcall:
22870 case no_stringop:
22871 gcc_unreachable ();
22872 case loop_1_byte:
22873 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22874 count_exp, QImode, 1, expected_size);
22875 break;
22876 case loop:
22877 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22878 count_exp, word_mode, 1, expected_size);
22879 break;
22880 case unrolled_loop:
22881 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22882 registers for 4 temporaries anyway. */
22883 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22884 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22885 expected_size);
22886 break;
22887 case rep_prefix_8_byte:
22888 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22889 DImode);
22890 break;
22891 case rep_prefix_4_byte:
22892 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22893 SImode);
22894 break;
22895 case rep_prefix_1_byte:
22896 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22897 QImode);
22898 break;
22900 /* Adjust properly the offset of src and dest memory for aliasing. */
22901 if (CONST_INT_P (count_exp))
22903 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22904 (count / size_needed) * size_needed);
22905 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22906 (count / size_needed) * size_needed);
22908 else
22910 src = change_address (src, BLKmode, srcreg);
22911 dst = change_address (dst, BLKmode, destreg);
22914 /* Step 4: Epilogue to copy the remaining bytes. */
22915 epilogue:
22916 if (label)
22918 /* When the main loop is done, COUNT_EXP might hold the original count,
22919 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22920 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22921 bytes. Compensate if needed. */
22923 if (size_needed < epilogue_size_needed)
22925 tmp =
22926 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22927 GEN_INT (size_needed - 1), count_exp, 1,
22928 OPTAB_DIRECT);
22929 if (tmp != count_exp)
22930 emit_move_insn (count_exp, tmp);
22932 emit_label (label);
22933 LABEL_NUSES (label) = 1;
22936 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22937 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22938 epilogue_size_needed);
22939 if (jump_around_label)
22940 emit_label (jump_around_label);
22941 return true;
22944 /* Helper function for memset. For the QImode value 0xXY produce
22945 0xXYXYXYXY of the width specified by MODE. This is essentially
22946 a * 0x01010101, but we can do slightly better than
22947 synth_mult by unwinding the sequence by hand on CPUs with
22948 slow multiply. */
22949 static rtx
22950 promote_duplicated_reg (enum machine_mode mode, rtx val)
22952 enum machine_mode valmode = GET_MODE (val);
22953 rtx tmp;
22954 int nops = mode == DImode ? 3 : 2;
22956 gcc_assert (mode == SImode || mode == DImode);
22957 if (val == const0_rtx)
22958 return copy_to_mode_reg (mode, const0_rtx);
22959 if (CONST_INT_P (val))
22961 HOST_WIDE_INT v = INTVAL (val) & 255;
22963 v |= v << 8;
22964 v |= v << 16;
22965 if (mode == DImode)
22966 v |= (v << 16) << 16;
22967 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22970 if (valmode == VOIDmode)
22971 valmode = QImode;
22972 if (valmode != QImode)
22973 val = gen_lowpart (QImode, val);
22974 if (mode == QImode)
22975 return val;
22976 if (!TARGET_PARTIAL_REG_STALL)
22977 nops--;
22978 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22979 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22980 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22981 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22983 rtx reg = convert_modes (mode, QImode, val, true);
22984 tmp = promote_duplicated_reg (mode, const1_rtx);
22985 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22986 OPTAB_DIRECT);
22988 else
22990 rtx reg = convert_modes (mode, QImode, val, true);
22992 if (!TARGET_PARTIAL_REG_STALL)
22993 if (mode == SImode)
22994 emit_insn (gen_movsi_insv_1 (reg, reg));
22995 else
22996 emit_insn (gen_movdi_insv_1 (reg, reg));
22997 else
22999 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23000 NULL, 1, OPTAB_DIRECT);
23001 reg =
23002 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23004 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23005 NULL, 1, OPTAB_DIRECT);
23006 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23007 if (mode == SImode)
23008 return reg;
23009 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23010 NULL, 1, OPTAB_DIRECT);
23011 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23012 return reg;
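#if 0
/* Illustrative sketch of the constant case above (hypothetical helper, not
   used anywhere): how a single byte value is splat across a 64-bit word,
   mirroring the v |= v << 8; v |= v << 16; (v << 16) << 16 steps.  */
static unsigned long long
splat_byte_example (unsigned int byte)
{
  unsigned long long v = byte & 255;   /* e.g. 0x00000000000000AB */
  v |= v << 8;                         /*      0x000000000000ABAB */
  v |= v << 16;                        /*      0x00000000ABABABAB */
  v |= (v << 16) << 16;                /*      0xABABABABABABABAB */
  return v;
}
#endif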
23016 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
23017 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
23018 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
23019 static rtx
23020 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23022 rtx promoted_val;
23024 if (TARGET_64BIT
23025 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23026 promoted_val = promote_duplicated_reg (DImode, val);
23027 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23028 promoted_val = promote_duplicated_reg (SImode, val);
23029 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23030 promoted_val = promote_duplicated_reg (HImode, val);
23031 else
23032 promoted_val = val;
23034 return promoted_val;
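/* For example, rep_prefix_8_byte on x86-64 (SIZE_NEEDED == 8) promotes VAL to
   DImode, rep_prefix_4_byte (SIZE_NEEDED == 4) promotes it to SImode, and a
   1-byte loop with no extra alignment work leaves VAL unpromoted.  */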
23037 /* Expand a string set operation (memset/bzero). Use i386 string operations
23038 when profitable. See the expand_movmem comment for an explanation of the
23039 individual steps performed. */
23040 bool
23041 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23042 rtx expected_align_exp, rtx expected_size_exp)
23044 rtx destreg;
23045 rtx label = NULL;
23046 rtx tmp;
23047 rtx jump_around_label = NULL;
23048 HOST_WIDE_INT align = 1;
23049 unsigned HOST_WIDE_INT count = 0;
23050 HOST_WIDE_INT expected_size = -1;
23051 int size_needed = 0, epilogue_size_needed;
23052 int desired_align = 0, align_bytes = 0;
23053 enum stringop_alg alg;
23054 rtx promoted_val = NULL;
23055 bool force_loopy_epilogue = false;
23056 int dynamic_check;
23057 bool need_zero_guard = false;
23058 bool noalign;
23060 if (CONST_INT_P (align_exp))
23061 align = INTVAL (align_exp);
23062 /* i386 can do misaligned access at a reasonably increased cost. */
23063 if (CONST_INT_P (expected_align_exp)
23064 && INTVAL (expected_align_exp) > align)
23065 align = INTVAL (expected_align_exp);
23066 if (CONST_INT_P (count_exp))
23067 count = expected_size = INTVAL (count_exp);
23068 if (CONST_INT_P (expected_size_exp) && count == 0)
23069 expected_size = INTVAL (expected_size_exp);
23071 /* Make sure we don't need to care about overflow later on. */
23072 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23073 return false;
23075 /* Step 0: Decide on preferred algorithm, desired alignment and
23076 size of chunks to be copied by main loop. */
23078 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23079 desired_align = decide_alignment (align, alg, expected_size);
23081 if (!TARGET_ALIGN_STRINGOPS || noalign)
23082 align = desired_align;
23084 if (alg == libcall)
23085 return false;
23086 gcc_assert (alg != no_stringop);
23087 if (!count)
23088 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23089 destreg = copy_addr_to_reg (XEXP (dst, 0));
23090 switch (alg)
23092 case libcall:
23093 case no_stringop:
23094 gcc_unreachable ();
23095 case loop:
23096 need_zero_guard = true;
23097 size_needed = GET_MODE_SIZE (word_mode);
23098 break;
23099 case unrolled_loop:
23100 need_zero_guard = true;
23101 size_needed = GET_MODE_SIZE (word_mode) * 4;
23102 break;
23103 case rep_prefix_8_byte:
23104 size_needed = 8;
23105 break;
23106 case rep_prefix_4_byte:
23107 size_needed = 4;
23108 break;
23109 case rep_prefix_1_byte:
23110 size_needed = 1;
23111 break;
23112 case loop_1_byte:
23113 need_zero_guard = true;
23114 size_needed = 1;
23115 break;
23117 epilogue_size_needed = size_needed;
23119 /* Step 1: Prologue guard. */
23121 /* Alignment code needs count to be in a register. */
23122 if (CONST_INT_P (count_exp) && desired_align > align)
23124 if (INTVAL (count_exp) > desired_align
23125 && INTVAL (count_exp) > size_needed)
23127 align_bytes
23128 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23129 if (align_bytes <= 0)
23130 align_bytes = 0;
23131 else
23132 align_bytes = desired_align - align_bytes;
23134 if (align_bytes == 0)
23136 enum machine_mode mode = SImode;
23137 if (TARGET_64BIT && (count & ~0xffffffff))
23138 mode = DImode;
23139 count_exp = force_reg (mode, count_exp);
23142 /* Do the cheap promotion to allow better CSE across the
23143 main loop and epilogue (i.e. one load of the big constant in
23144 front of all the code). */
23145 if (CONST_INT_P (val_exp))
23146 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23147 desired_align, align);
23148 /* Ensure that alignment prologue won't copy past end of block. */
23149 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23151 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23152 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23153 Make sure it is power of 2. */
23154 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23156 /* To improve performance of small blocks, we jump around the VAL
23157 promoting code. This means that if the promoted VAL is not constant,
23158 we might not use it in the epilogue and have to use the byte
23159 loop variant. */
23160 if (epilogue_size_needed > 2 && !promoted_val)
23161 force_loopy_epilogue = true;
23162 if (count)
23164 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23166 /* If main algorithm works on QImode, no epilogue is needed.
23167 For small sizes just don't align anything. */
23168 if (size_needed == 1)
23169 desired_align = align;
23170 else
23171 goto epilogue;
23174 else
23176 label = gen_label_rtx ();
23177 emit_cmp_and_jump_insns (count_exp,
23178 GEN_INT (epilogue_size_needed),
23179 LTU, 0, counter_mode (count_exp), 1, label);
23180 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23181 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23182 else
23183 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23186 if (dynamic_check != -1)
23188 rtx hot_label = gen_label_rtx ();
23189 jump_around_label = gen_label_rtx ();
23190 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23191 LEU, 0, counter_mode (count_exp), 1, hot_label);
23192 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23193 set_storage_via_libcall (dst, count_exp, val_exp, false);
23194 emit_jump (jump_around_label);
23195 emit_label (hot_label);
23198 /* Step 2: Alignment prologue. */
23200 /* Do the expensive promotion once we branched off the small blocks. */
23201 if (!promoted_val)
23202 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23203 desired_align, align);
23204 gcc_assert (desired_align >= 1 && align >= 1);
23206 if (desired_align > align)
23208 if (align_bytes == 0)
23210 /* Except for the first move in the epilogue, we no longer know
23211 the constant offset in the aliasing info. It does not seem worth
23212 the pain to maintain it for the first move, so throw away
23213 the info early. */
23214 dst = change_address (dst, BLKmode, destreg);
23215 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23216 desired_align);
23218 else
23220 /* If we know how many bytes need to be stored before dst is
23221 sufficiently aligned, maintain aliasing info accurately. */
23222 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23223 desired_align, align_bytes);
23224 count_exp = plus_constant (counter_mode (count_exp),
23225 count_exp, -align_bytes);
23226 count -= align_bytes;
23228 if (need_zero_guard
23229 && (count < (unsigned HOST_WIDE_INT) size_needed
23230 || (align_bytes == 0
23231 && count < ((unsigned HOST_WIDE_INT) size_needed
23232 + desired_align - align))))
23234 /* It is possible that we copied enough so the main loop will not
23235 execute. */
23236 gcc_assert (size_needed > 1);
23237 if (label == NULL_RTX)
23238 label = gen_label_rtx ();
23239 emit_cmp_and_jump_insns (count_exp,
23240 GEN_INT (size_needed),
23241 LTU, 0, counter_mode (count_exp), 1, label);
23242 if (expected_size == -1
23243 || expected_size < (desired_align - align) / 2 + size_needed)
23244 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23245 else
23246 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23249 if (label && size_needed == 1)
23251 emit_label (label);
23252 LABEL_NUSES (label) = 1;
23253 label = NULL;
23254 promoted_val = val_exp;
23255 epilogue_size_needed = 1;
23257 else if (label == NULL_RTX)
23258 epilogue_size_needed = size_needed;
23260 /* Step 3: Main loop. */
23262 switch (alg)
23264 case libcall:
23265 case no_stringop:
23266 gcc_unreachable ();
23267 case loop_1_byte:
23268 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23269 count_exp, QImode, 1, expected_size);
23270 break;
23271 case loop:
23272 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23273 count_exp, word_mode, 1, expected_size);
23274 break;
23275 case unrolled_loop:
23276 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23277 count_exp, word_mode, 4, expected_size);
23278 break;
23279 case rep_prefix_8_byte:
23280 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23281 DImode, val_exp);
23282 break;
23283 case rep_prefix_4_byte:
23284 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23285 SImode, val_exp);
23286 break;
23287 case rep_prefix_1_byte:
23288 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23289 QImode, val_exp);
23290 break;
23292 /* Properly adjust the offsets of the src and dest memory for aliasing. */
23293 if (CONST_INT_P (count_exp))
23294 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23295 (count / size_needed) * size_needed);
23296 else
23297 dst = change_address (dst, BLKmode, destreg);
23299 /* Step 4: Epilogue to copy the remaining bytes. */
23301 if (label)
23303 /* When the main loop is done, COUNT_EXP might hold the original count,
23304 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23305 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23306 bytes. Compensate if needed. */
23308 if (size_needed < epilogue_size_needed)
23310 tmp =
23311 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23312 GEN_INT (size_needed - 1), count_exp, 1,
23313 OPTAB_DIRECT);
23314 if (tmp != count_exp)
23315 emit_move_insn (count_exp, tmp);
23317 emit_label (label);
23318 LABEL_NUSES (label) = 1;
23320 epilogue:
23321 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23323 if (force_loopy_epilogue)
23324 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23325 epilogue_size_needed);
23326 else
23327 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23328 epilogue_size_needed);
23330 if (jump_around_label)
23331 emit_label (jump_around_label);
23332 return true;
23335 /* Expand the appropriate insns for doing strlen if not just doing
23336 repnz; scasb
23338 out = result, initialized with the start address
23339 align_rtx = alignment of the address.
23340 scratch = scratch register, initialized with the start address when
23341 not aligned, otherwise undefined
23343 This is just the body. It needs the initializations mentioned above and
23344 some address computing at the end. These things are done in i386.md. */
23346 static void
23347 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23349 int align;
23350 rtx tmp;
23351 rtx align_2_label = NULL_RTX;
23352 rtx align_3_label = NULL_RTX;
23353 rtx align_4_label = gen_label_rtx ();
23354 rtx end_0_label = gen_label_rtx ();
23355 rtx mem;
23356 rtx tmpreg = gen_reg_rtx (SImode);
23357 rtx scratch = gen_reg_rtx (SImode);
23358 rtx cmp;
23360 align = 0;
23361 if (CONST_INT_P (align_rtx))
23362 align = INTVAL (align_rtx);
23364 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23366 /* Is there a known alignment and is it less than 4? */
23367 if (align < 4)
23369 rtx scratch1 = gen_reg_rtx (Pmode);
23370 emit_move_insn (scratch1, out);
23371 /* Is there a known alignment and is it not 2? */
23372 if (align != 2)
23374 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23375 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23377 /* Leave just the 3 lower bits. */
23378 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23379 NULL_RTX, 0, OPTAB_WIDEN);
23381 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23382 Pmode, 1, align_4_label);
23383 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23384 Pmode, 1, align_2_label);
23385 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23386 Pmode, 1, align_3_label);
23388 else
23390 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23391 check if it is aligned to a 4-byte boundary. */
23393 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23394 NULL_RTX, 0, OPTAB_WIDEN);
23396 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23397 Pmode, 1, align_4_label);
23400 mem = change_address (src, QImode, out);
23402 /* Now compare the bytes. */
23404 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23405 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23406 QImode, 1, end_0_label);
23408 /* Increment the address. */
23409 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23411 /* Not needed with an alignment of 2 */
23412 if (align != 2)
23414 emit_label (align_2_label);
23416 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23417 end_0_label);
23419 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23421 emit_label (align_3_label);
23424 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23425 end_0_label);
23427 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23430 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23431 align this loop; that only makes the program larger and does not
23432 speed it up. */
23433 emit_label (align_4_label);
23435 mem = change_address (src, SImode, out);
23436 emit_move_insn (scratch, mem);
23437 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23439 /* This formula yields a nonzero result iff one of the bytes is zero.
23440 This saves three branches inside the loop and many cycles. */
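/* Concretely, the sequence below computes (x - 0x01010101) & ~x & 0x80808080.
   For example, for x = 0x62610061 (bytes 61 00 61 62) the result is
   0x00008000, with the sign bit set in the position of the zero byte;
   a word containing no zero byte yields 0.  */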
23442 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23443 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23444 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23445 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23446 gen_int_mode (0x80808080, SImode)));
23447 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23448 align_4_label);
23450 if (TARGET_CMOVE)
23452 rtx reg = gen_reg_rtx (SImode);
23453 rtx reg2 = gen_reg_rtx (Pmode);
23454 emit_move_insn (reg, tmpreg);
23455 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23457 /* If zero is not in the first two bytes, move two bytes forward. */
23458 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23459 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23460 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23461 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23462 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23463 reg,
23464 tmpreg)));
23465 /* Emit lea manually to avoid clobbering of flags. */
23466 emit_insn (gen_rtx_SET (SImode, reg2,
23467 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23469 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23470 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23471 emit_insn (gen_rtx_SET (VOIDmode, out,
23472 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23473 reg2,
23474 out)));
23476 else
23478 rtx end_2_label = gen_label_rtx ();
23479 /* Is zero in the first two bytes? */
23481 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23482 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23483 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23484 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23485 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23486 pc_rtx);
23487 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23488 JUMP_LABEL (tmp) = end_2_label;
23490 /* Not in the first two. Move two bytes forward. */
23491 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23492 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23494 emit_label (end_2_label);
23498 /* Avoid branch in fixing the byte. */
23499 tmpreg = gen_lowpart (QImode, tmpreg);
23500 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23501 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23502 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23503 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23505 emit_label (end_0_label);
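#if 0
/* A simplified C rendering of the strategy emitted above (hypothetical
   helper, not used anywhere): check up to 3 bytes to reach 4-byte alignment,
   scan a word at a time with the zero-byte formula, then locate the zero
   byte.  Assumes little-endian layout and that reading the whole aligned
   word containing the terminator is acceptable, as it is for the emitted
   code; the real fixup above uses flags/cmove instead of a byte loop.  */
static size_t
strlen_unrolled_sketch (const char *s)
{
  const char *p = s;

  /* Check at most 3 leading bytes to reach 4-byte alignment.  */
  while (((unsigned long) p & 3) != 0)
    {
      if (*p == 0)
	return p - s;
      p++;
    }

  /* Scan 4 bytes at a time; the formula is nonzero iff some byte is zero.  */
  for (;;)
    {
      unsigned int w = *(const unsigned int *) p;
      if (((w - 0x01010101u) & ~w & 0x80808080u) != 0)
	break;
      p += 4;
    }

  /* Find the zero byte within the final word.  */
  while (*p != 0)
    p++;
  return p - s;
}
#endif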
23508 /* Expand strlen. */
23510 bool
23511 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23513 rtx addr, scratch1, scratch2, scratch3, scratch4;
23515 /* The generic case of the strlen expander is long. Avoid expanding
23516 it unless TARGET_INLINE_ALL_STRINGOPS. */
23518 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23519 && !TARGET_INLINE_ALL_STRINGOPS
23520 && !optimize_insn_for_size_p ()
23521 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23522 return false;
23524 addr = force_reg (Pmode, XEXP (src, 0));
23525 scratch1 = gen_reg_rtx (Pmode);
23527 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23528 && !optimize_insn_for_size_p ())
23530 /* Well, it seems that some optimizer does not combine a call like
23531 foo(strlen(bar), strlen(bar));
23532 when the move and the subtraction are done here. It does calculate
23533 the length just once when these instructions are done inside of
23534 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23535 often used and I use one fewer register for the lifetime of
23536 output_strlen_unroll() this is better. */
23538 emit_move_insn (out, addr);
23540 ix86_expand_strlensi_unroll_1 (out, src, align);
23542 /* strlensi_unroll_1 returns the address of the zero at the end of
23543 the string, like memchr(), so compute the length by subtracting
23544 the start address. */
23545 emit_insn (ix86_gen_sub3 (out, out, addr));
23547 else
23549 rtx unspec;
23551 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23552 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23553 return false;
23555 scratch2 = gen_reg_rtx (Pmode);
23556 scratch3 = gen_reg_rtx (Pmode);
23557 scratch4 = force_reg (Pmode, constm1_rtx);
23559 emit_move_insn (scratch3, addr);
23560 eoschar = force_reg (QImode, eoschar);
23562 src = replace_equiv_address_nv (src, scratch3);
23564 /* If .md starts supporting :P, this can be done in .md. */
23565 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23566 scratch4), UNSPEC_SCAS);
23567 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23568 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23569 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23571 return true;
23574 /* For a given symbol (function) construct code to compute the address of its
23575 PLT entry in the large x86-64 PIC model. */
23576 static rtx
23577 construct_plt_address (rtx symbol)
23579 rtx tmp, unspec;
23581 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23582 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23583 gcc_assert (Pmode == DImode);
23585 tmp = gen_reg_rtx (Pmode);
23586 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23588 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23589 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23590 return tmp;
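/* The generated sequence is roughly
     movabs $symbol@PLTOFF, %reg
     add    <PIC base register>, %reg
   and the caller then calls indirectly through %reg (see ix86_expand_call
   below).  */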
23594 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23595 rtx callarg2,
23596 rtx pop, bool sibcall)
23598 /* We need to represent that SI and DI registers are clobbered
23599 by SYSV calls. */
23600 static int clobbered_registers[] = {
23601 XMM6_REG, XMM7_REG, XMM8_REG,
23602 XMM9_REG, XMM10_REG, XMM11_REG,
23603 XMM12_REG, XMM13_REG, XMM14_REG,
23604 XMM15_REG, SI_REG, DI_REG
23606 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23607 rtx use = NULL, call;
23608 unsigned int vec_len;
23610 if (pop == const0_rtx)
23611 pop = NULL;
23612 gcc_assert (!TARGET_64BIT || !pop);
23614 if (TARGET_MACHO && !TARGET_64BIT)
23616 #if TARGET_MACHO
23617 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23618 fnaddr = machopic_indirect_call_target (fnaddr);
23619 #endif
23621 else
23623 /* Static functions and indirect calls don't need the pic register. */
23624 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23625 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23626 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23627 use_reg (&use, pic_offset_table_rtx);
23630 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23632 rtx al = gen_rtx_REG (QImode, AX_REG);
23633 emit_move_insn (al, callarg2);
23634 use_reg (&use, al);
23637 if (ix86_cmodel == CM_LARGE_PIC
23638 && MEM_P (fnaddr)
23639 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23640 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23641 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23642 else if (sibcall
23643 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23644 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23646 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23647 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23650 vec_len = 0;
23651 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23652 if (retval)
23653 call = gen_rtx_SET (VOIDmode, retval, call);
23654 vec[vec_len++] = call;
23656 if (pop)
23658 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23659 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23660 vec[vec_len++] = pop;
23663 if (TARGET_64BIT_MS_ABI
23664 && (!callarg2 || INTVAL (callarg2) != -2))
23666 unsigned i;
23668 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23669 UNSPEC_MS_TO_SYSV_CALL);
23671 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23672 vec[vec_len++]
23673 = gen_rtx_CLOBBER (VOIDmode,
23674 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23675 ? TImode : DImode,
23676 clobbered_registers[i]));
23679 if (vec_len > 1)
23680 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23681 call = emit_call_insn (call);
23682 if (use)
23683 CALL_INSN_FUNCTION_USAGE (call) = use;
23685 return call;
23688 /* Output the assembly for a call instruction. */
23690 const char *
23691 ix86_output_call_insn (rtx insn, rtx call_op)
23693 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23694 bool seh_nop_p = false;
23695 const char *xasm;
23697 if (SIBLING_CALL_P (insn))
23699 if (direct_p)
23700 xasm = "jmp\t%P0";
23701 /* SEH epilogue detection requires the indirect branch case
23702 to include REX.W. */
23703 else if (TARGET_SEH)
23704 xasm = "rex.W jmp %A0";
23705 else
23706 xasm = "jmp\t%A0";
23708 output_asm_insn (xasm, &call_op);
23709 return "";
23712 /* SEH unwinding can require an extra nop to be emitted in several
23713 circumstances. Determine if we have one of those. */
23714 if (TARGET_SEH)
23716 rtx i;
23718 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23720 /* If we get to another real insn, we don't need the nop. */
23721 if (INSN_P (i))
23722 break;
23724 /* If we get to the epilogue note, prevent a catch region from
23725 being adjacent to the standard epilogue sequence. If non-
23726 call-exceptions, we'll have done this during epilogue emission. */
23727 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23728 && !flag_non_call_exceptions
23729 && !can_throw_internal (insn))
23731 seh_nop_p = true;
23732 break;
23736 /* If we didn't find a real insn following the call, prevent the
23737 unwinder from looking into the next function. */
23738 if (i == NULL)
23739 seh_nop_p = true;
23742 if (direct_p)
23743 xasm = "call\t%P0";
23744 else
23745 xasm = "call\t%A0";
23747 output_asm_insn (xasm, &call_op);
23749 if (seh_nop_p)
23750 return "nop";
23752 return "";
23755 /* Clear stack slot assignments remembered from previous functions.
23756 This is called from INIT_EXPANDERS once before RTL is emitted for each
23757 function. */
23759 static struct machine_function *
23760 ix86_init_machine_status (void)
23762 struct machine_function *f;
23764 f = ggc_alloc_cleared_machine_function ();
23765 f->use_fast_prologue_epilogue_nregs = -1;
23766 f->call_abi = ix86_abi;
23768 return f;
23771 /* Return a MEM corresponding to a stack slot with mode MODE.
23772 Allocate a new slot if necessary.
23774 The RTL for a function can have several slots available: N is
23775 which slot to use. */
23778 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23780 struct stack_local_entry *s;
23782 gcc_assert (n < MAX_386_STACK_LOCALS);
23784 for (s = ix86_stack_locals; s; s = s->next)
23785 if (s->mode == mode && s->n == n)
23786 return validize_mem (copy_rtx (s->rtl));
23788 s = ggc_alloc_stack_local_entry ();
23789 s->n = n;
23790 s->mode = mode;
23791 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23793 s->next = ix86_stack_locals;
23794 ix86_stack_locals = s;
23795 return validize_mem (s->rtl);
23798 static void
23799 ix86_instantiate_decls (void)
23801 struct stack_local_entry *s;
23803 for (s = ix86_stack_locals; s; s = s->next)
23804 if (s->rtl != NULL_RTX)
23805 instantiate_decl_rtl (s->rtl);
23808 /* Calculate the length of the memory address in the instruction encoding.
23809 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23810 or other prefixes. We never generate addr32 prefix for LEA insn. */
23813 memory_address_length (rtx addr, bool lea)
23815 struct ix86_address parts;
23816 rtx base, index, disp;
23817 int len;
23818 int ok;
23820 if (GET_CODE (addr) == PRE_DEC
23821 || GET_CODE (addr) == POST_INC
23822 || GET_CODE (addr) == PRE_MODIFY
23823 || GET_CODE (addr) == POST_MODIFY)
23824 return 0;
23826 ok = ix86_decompose_address (addr, &parts);
23827 gcc_assert (ok);
23829 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23831 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
23832 if (TARGET_64BIT && !lea
23833 && (SImode_address_operand (addr, VOIDmode)
23834 || (parts.base && GET_MODE (parts.base) == SImode)
23835 || (parts.index && GET_MODE (parts.index) == SImode)))
23836 len++;
23838 base = parts.base;
23839 index = parts.index;
23840 disp = parts.disp;
23842 if (base && GET_CODE (base) == SUBREG)
23843 base = SUBREG_REG (base);
23844 if (index && GET_CODE (index) == SUBREG)
23845 index = SUBREG_REG (index);
23847 gcc_assert (base == NULL_RTX || REG_P (base));
23848 gcc_assert (index == NULL_RTX || REG_P (index));
23850 /* Rule of thumb:
23851 - esp as the base always wants an index,
23852 - ebp as the base always wants a displacement,
23853 - r12 as the base always wants an index,
23854 - r13 as the base always wants a displacement. */
23856 /* Register Indirect. */
23857 if (base && !index && !disp)
23859 /* esp (for its index) and ebp (for its displacement) need
23860 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23861 code. */
23862 if (base == arg_pointer_rtx
23863 || base == frame_pointer_rtx
23864 || REGNO (base) == SP_REG
23865 || REGNO (base) == BP_REG
23866 || REGNO (base) == R12_REG
23867 || REGNO (base) == R13_REG)
23868 len++;
23871 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23872 is not disp32, but disp32(%rip), so for disp32
23873 a SIB byte is needed, unless print_operand_address
23874 optimizes it into disp32(%rip) or (%rip) is implied
23875 by UNSPEC. */
23876 else if (disp && !base && !index)
23878 len += 4;
23879 if (TARGET_64BIT)
23881 rtx symbol = disp;
23883 if (GET_CODE (disp) == CONST)
23884 symbol = XEXP (disp, 0);
23885 if (GET_CODE (symbol) == PLUS
23886 && CONST_INT_P (XEXP (symbol, 1)))
23887 symbol = XEXP (symbol, 0);
23889 if (GET_CODE (symbol) != LABEL_REF
23890 && (GET_CODE (symbol) != SYMBOL_REF
23891 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23892 && (GET_CODE (symbol) != UNSPEC
23893 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23894 && XINT (symbol, 1) != UNSPEC_PCREL
23895 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23896 len++;
23899 else
23901 /* Find the length of the displacement constant. */
23902 if (disp)
23904 if (base && satisfies_constraint_K (disp))
23905 len += 1;
23906 else
23907 len += 4;
23909 /* ebp always wants a displacement. Similarly r13. */
23910 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23911 len++;
23913 /* An index requires the two-byte modrm form.... */
23914 if (index
23915 /* ...like esp (or r12), which always wants an index. */
23916 || base == arg_pointer_rtx
23917 || base == frame_pointer_rtx
23918 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23919 len++;
23922 return len;
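/* A few illustrative values in 32-bit mode with no segment override:
     (%eax)          -> 0
     (%esp)          -> 1  (SIB byte)
     8(%ebp)         -> 1  (disp8)
     12(%eax,%ebx,4) -> 2  (SIB byte + disp8)
     sym             -> 4  (disp32)  */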
23925 /* Compute the default value for the "length_immediate" attribute. When
23926 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
23928 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23930 int len = 0;
23931 int i;
23932 extract_insn_cached (insn);
23933 for (i = recog_data.n_operands - 1; i >= 0; --i)
23934 if (CONSTANT_P (recog_data.operand[i]))
23936 enum attr_mode mode = get_attr_mode (insn);
23938 gcc_assert (!len);
23939 if (shortform && CONST_INT_P (recog_data.operand[i]))
23941 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23942 switch (mode)
23944 case MODE_QI:
23945 len = 1;
23946 continue;
23947 case MODE_HI:
23948 ival = trunc_int_for_mode (ival, HImode);
23949 break;
23950 case MODE_SI:
23951 ival = trunc_int_for_mode (ival, SImode);
23952 break;
23953 default:
23954 break;
23956 if (IN_RANGE (ival, -128, 127))
23958 len = 1;
23959 continue;
23962 switch (mode)
23964 case MODE_QI:
23965 len = 1;
23966 break;
23967 case MODE_HI:
23968 len = 2;
23969 break;
23970 case MODE_SI:
23971 len = 4;
23972 break;
23973 /* Immediates for DImode instructions are encoded
23974 as 32-bit sign-extended values. */
23975 case MODE_DI:
23976 len = 4;
23977 break;
23978 default:
23979 fatal_insn ("unknown insn mode", insn);
23982 return len;
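/* For example, with SHORTFORM set an SImode immediate of 3 encodes as an
   imm8 (length 1) while 300 needs a full imm32 (length 4); a HImode
   immediate without a short form is 2 bytes, and DImode immediates always
   count as 4 since they are encoded as sign-extended imm32.  */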
23985 /* Compute default value for "length_address" attribute. */
23987 ix86_attr_length_address_default (rtx insn)
23989 int i;
23991 if (get_attr_type (insn) == TYPE_LEA)
23993 rtx set = PATTERN (insn), addr;
23995 if (GET_CODE (set) == PARALLEL)
23996 set = XVECEXP (set, 0, 0);
23998 gcc_assert (GET_CODE (set) == SET);
24000 addr = SET_SRC (set);
24002 return memory_address_length (addr, true);
24005 extract_insn_cached (insn);
24006 for (i = recog_data.n_operands - 1; i >= 0; --i)
24007 if (MEM_P (recog_data.operand[i]))
24009 constrain_operands_cached (reload_completed);
24010 if (which_alternative != -1)
24012 const char *constraints = recog_data.constraints[i];
24013 int alt = which_alternative;
24015 while (*constraints == '=' || *constraints == '+')
24016 constraints++;
24017 while (alt-- > 0)
24018 while (*constraints++ != ',')
24020 /* Skip ignored operands. */
24021 if (*constraints == 'X')
24022 continue;
24024 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24026 return 0;
24029 /* Compute default value for "length_vex" attribute. It includes
24030 2 or 3 byte VEX prefix and 1 opcode byte. */
24033 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24035 int i;
24037 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
24038 requires the 3-byte VEX prefix. */
24039 if (!has_0f_opcode || has_vex_w)
24040 return 3 + 1;
24042 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24043 if (!TARGET_64BIT)
24044 return 2 + 1;
24046 extract_insn_cached (insn);
24048 for (i = recog_data.n_operands - 1; i >= 0; --i)
24049 if (REG_P (recog_data.operand[i]))
24051 /* REX.W bit uses 3 byte VEX prefix. */
24052 if (GET_MODE (recog_data.operand[i]) == DImode
24053 && GENERAL_REG_P (recog_data.operand[i]))
24054 return 3 + 1;
24056 else
24058 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24059 if (MEM_P (recog_data.operand[i])
24060 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24061 return 3 + 1;
24064 return 2 + 1;
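/* Background: the 2-byte VEX prefix (0xC5) can only encode the implied 0F
   opcode map and provides no VEX.W, VEX.X or VEX.B bits, so a 64-bit GPR
   operand (needing REX.W) or a memory operand that uses extended registers
   forces the 3-byte form (0xC4).  */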
24067 /* Return the maximum number of instructions a cpu can issue. */
24069 static int
24070 ix86_issue_rate (void)
24072 switch (ix86_tune)
24074 case PROCESSOR_PENTIUM:
24075 case PROCESSOR_ATOM:
24076 case PROCESSOR_K6:
24077 case PROCESSOR_BTVER2:
24078 return 2;
24080 case PROCESSOR_PENTIUMPRO:
24081 case PROCESSOR_PENTIUM4:
24082 case PROCESSOR_CORE2:
24083 case PROCESSOR_COREI7:
24084 case PROCESSOR_HASWELL:
24085 case PROCESSOR_ATHLON:
24086 case PROCESSOR_K8:
24087 case PROCESSOR_AMDFAM10:
24088 case PROCESSOR_NOCONA:
24089 case PROCESSOR_GENERIC32:
24090 case PROCESSOR_GENERIC64:
24091 case PROCESSOR_BDVER1:
24092 case PROCESSOR_BDVER2:
24093 case PROCESSOR_BDVER3:
24094 case PROCESSOR_BTVER1:
24095 return 3;
24097 default:
24098 return 1;
24102 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24103 set by DEP_INSN and nothing else set by DEP_INSN. */
24105 static bool
24106 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24108 rtx set, set2;
24110 /* Simplify the test for uninteresting insns. */
24111 if (insn_type != TYPE_SETCC
24112 && insn_type != TYPE_ICMOV
24113 && insn_type != TYPE_FCMOV
24114 && insn_type != TYPE_IBR)
24115 return false;
24117 if ((set = single_set (dep_insn)) != 0)
24119 set = SET_DEST (set);
24120 set2 = NULL_RTX;
24122 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24123 && XVECLEN (PATTERN (dep_insn), 0) == 2
24124 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24125 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24127 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24128 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24130 else
24131 return false;
24133 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24134 return false;
24136 /* This test is true if the dependent insn reads the flags but
24137 not any other potentially set register. */
24138 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24139 return false;
24141 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24142 return false;
24144 return true;
24147 /* Return true iff USE_INSN has a memory address with operands set by
24148 SET_INSN. */
24150 bool
24151 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24153 int i;
24154 extract_insn_cached (use_insn);
24155 for (i = recog_data.n_operands - 1; i >= 0; --i)
24156 if (MEM_P (recog_data.operand[i]))
24158 rtx addr = XEXP (recog_data.operand[i], 0);
24159 return modified_in_p (addr, set_insn) != 0;
24161 return false;
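/* A typical case this catches (used for the Pentium AGI penalty below):
     mov  %eax, %ebx
     mov  (%ebx), %ecx
   The load's address depends on the register written by the preceding insn,
   so address generation must wait for it.  */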
24164 static int
24165 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24167 enum attr_type insn_type, dep_insn_type;
24168 enum attr_memory memory;
24169 rtx set, set2;
24170 int dep_insn_code_number;
24172 /* Anti and output dependencies have zero cost on all CPUs. */
24173 if (REG_NOTE_KIND (link) != 0)
24174 return 0;
24176 dep_insn_code_number = recog_memoized (dep_insn);
24178 /* If we can't recognize the insns, we can't really do anything. */
24179 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24180 return cost;
24182 insn_type = get_attr_type (insn);
24183 dep_insn_type = get_attr_type (dep_insn);
24185 switch (ix86_tune)
24187 case PROCESSOR_PENTIUM:
24188 /* Address Generation Interlock adds a cycle of latency. */
24189 if (insn_type == TYPE_LEA)
24191 rtx addr = PATTERN (insn);
24193 if (GET_CODE (addr) == PARALLEL)
24194 addr = XVECEXP (addr, 0, 0);
24196 gcc_assert (GET_CODE (addr) == SET);
24198 addr = SET_SRC (addr);
24199 if (modified_in_p (addr, dep_insn))
24200 cost += 1;
24202 else if (ix86_agi_dependent (dep_insn, insn))
24203 cost += 1;
24205 /* ??? Compares pair with jump/setcc. */
24206 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24207 cost = 0;
24209 /* Floating point stores require value to be ready one cycle earlier. */
24210 if (insn_type == TYPE_FMOV
24211 && get_attr_memory (insn) == MEMORY_STORE
24212 && !ix86_agi_dependent (dep_insn, insn))
24213 cost += 1;
24214 break;
24216 case PROCESSOR_PENTIUMPRO:
24217 memory = get_attr_memory (insn);
24219 /* INT->FP conversion is expensive. */
24220 if (get_attr_fp_int_src (dep_insn))
24221 cost += 5;
24223 /* There is one cycle extra latency between an FP op and a store. */
24224 if (insn_type == TYPE_FMOV
24225 && (set = single_set (dep_insn)) != NULL_RTX
24226 && (set2 = single_set (insn)) != NULL_RTX
24227 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24228 && MEM_P (SET_DEST (set2)))
24229 cost += 1;
24231 /* Model the ability of the reorder buffer to hide the latency of a load by
24232 executing it in parallel with the previous instruction when the
24233 previous instruction is not needed to compute the address. */
24234 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24235 && !ix86_agi_dependent (dep_insn, insn))
24237 /* Claim moves to take one cycle, as the core can issue one load
24238 at a time and the next load can start a cycle later. */
24239 if (dep_insn_type == TYPE_IMOV
24240 || dep_insn_type == TYPE_FMOV)
24241 cost = 1;
24242 else if (cost > 1)
24243 cost--;
24245 break;
24247 case PROCESSOR_K6:
24248 memory = get_attr_memory (insn);
24250 /* The esp dependency is resolved before the instruction is really
24251 finished. */
24252 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24253 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24254 return 1;
24256 /* INT->FP conversion is expensive. */
24257 if (get_attr_fp_int_src (dep_insn))
24258 cost += 5;
24260 /* Model the ability of the reorder buffer to hide the latency of a load by
24261 executing it in parallel with the previous instruction when the
24262 previous instruction is not needed to compute the address. */
24263 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24264 && !ix86_agi_dependent (dep_insn, insn))
24266 /* Claim moves to take one cycle, as the core can issue one load
24267 at a time and the next load can start a cycle later. */
24268 if (dep_insn_type == TYPE_IMOV
24269 || dep_insn_type == TYPE_FMOV)
24270 cost = 1;
24271 else if (cost > 2)
24272 cost -= 2;
24273 else
24274 cost = 1;
24276 break;
24278 case PROCESSOR_ATHLON:
24279 case PROCESSOR_K8:
24280 case PROCESSOR_AMDFAM10:
24281 case PROCESSOR_BDVER1:
24282 case PROCESSOR_BDVER2:
24283 case PROCESSOR_BDVER3:
24284 case PROCESSOR_BTVER1:
24285 case PROCESSOR_BTVER2:
24286 case PROCESSOR_ATOM:
24287 case PROCESSOR_GENERIC32:
24288 case PROCESSOR_GENERIC64:
24289 memory = get_attr_memory (insn);
24291 /* Model the ability of the reorder buffer to hide the latency of a load by
24292 executing it in parallel with the previous instruction when the
24293 previous instruction is not needed to compute the address. */
24294 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24295 && !ix86_agi_dependent (dep_insn, insn))
24297 enum attr_unit unit = get_attr_unit (insn);
24298 int loadcost = 3;
24300 /* Because of the difference between the length of integer and
24301 floating unit pipeline preparation stages, the memory operands
24302 for floating point are cheaper.
24304 ??? For Athlon the difference is most probably 2. */
24305 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24306 loadcost = 3;
24307 else
24308 loadcost = TARGET_ATHLON ? 2 : 0;
24310 if (cost >= loadcost)
24311 cost -= loadcost;
24312 else
24313 cost = 0;
24316 default:
24317 break;
24320 return cost;
24323 /* How many alternative schedules to try. This should be as wide as the
24324 scheduling freedom in the DFA, but no wider. Making this value too
24325 large results in extra work for the scheduler. */
24327 static int
24328 ia32_multipass_dfa_lookahead (void)
24330 switch (ix86_tune)
24332 case PROCESSOR_PENTIUM:
24333 return 2;
24335 case PROCESSOR_PENTIUMPRO:
24336 case PROCESSOR_K6:
24337 return 1;
24339 case PROCESSOR_CORE2:
24340 case PROCESSOR_COREI7:
24341 case PROCESSOR_HASWELL:
24342 case PROCESSOR_ATOM:
24343 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24344 as the number of instructions that can be executed in a cycle, i.e.,
24345 issue_rate. I wonder why tuning for many CPUs does not do this. */
24346 if (reload_completed)
24347 return ix86_issue_rate ();
24348 /* Don't use lookahead for pre-reload schedule to save compile time. */
24349 return 0;
24351 default:
24352 return 0;
24356 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24357 execution. It is applied if
24358 (1) an IMUL instruction is on the top of the list;
24359 (2) there is exactly one producer of an independent IMUL instruction in
24360 the ready list;
24361 if so, (3) put the found producer on the top of the ready list.
24362 Returns the issue rate. */
24364 static int
24365 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24366 int clock_var ATTRIBUTE_UNUSED)
24368 static int issue_rate = -1;
24369 int n_ready = *pn_ready;
24370 rtx insn, insn1, insn2;
24371 int i;
24372 sd_iterator_def sd_it;
24373 dep_t dep;
24374 int index = -1;
24376 /* Set up issue rate. */
24377 issue_rate = ix86_issue_rate();
24379 /* Do reordering for Atom only. */
24380 if (ix86_tune != PROCESSOR_ATOM)
24381 return issue_rate;
24382 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24383 if (!reload_completed)
24384 return issue_rate;
24385 /* Nothing to do if ready list contains only 1 instruction. */
24386 if (n_ready <= 1)
24387 return issue_rate;
24389 /* Check that IMUL instruction is on the top of ready list. */
24390 insn = ready[n_ready - 1];
24391 if (!NONDEBUG_INSN_P (insn))
24392 return issue_rate;
24393 insn = PATTERN (insn);
24394 if (GET_CODE (insn) == PARALLEL)
24395 insn = XVECEXP (insn, 0, 0);
24396 if (GET_CODE (insn) != SET)
24397 return issue_rate;
24398 if (!(GET_CODE (SET_SRC (insn)) == MULT
24399 && GET_MODE (SET_SRC (insn)) == SImode))
24400 return issue_rate;
24402 /* Search for producer of independent IMUL instruction. */
24403 for (i = n_ready - 2; i>= 0; i--)
24405 insn = ready[i];
24406 if (!NONDEBUG_INSN_P (insn))
24407 continue;
24408 /* Skip IMUL instruction. */
24409 insn2 = PATTERN (insn);
24410 if (GET_CODE (insn2) == PARALLEL)
24411 insn2 = XVECEXP (insn2, 0, 0);
24412 if (GET_CODE (insn2) == SET
24413 && GET_CODE (SET_SRC (insn2)) == MULT
24414 && GET_MODE (SET_SRC (insn2)) == SImode)
24415 continue;
24417 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24419 rtx con;
24420 con = DEP_CON (dep);
24421 if (!NONDEBUG_INSN_P (con))
24422 continue;
24423 insn1 = PATTERN (con);
24424 if (GET_CODE (insn1) == PARALLEL)
24425 insn1 = XVECEXP (insn1, 0, 0);
24427 if (GET_CODE (insn1) == SET
24428 && GET_CODE (SET_SRC (insn1)) == MULT
24429 && GET_MODE (SET_SRC (insn1)) == SImode)
24431 sd_iterator_def sd_it1;
24432 dep_t dep1;
24433 /* Check that there is no other producer for the IMUL. */
24434 index = i;
24435 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24437 rtx pro;
24438 pro = DEP_PRO (dep1);
24439 if (!NONDEBUG_INSN_P (pro))
24440 continue;
24441 if (pro != insn)
24442 index = -1;
24444 if (index >= 0)
24445 break;
24448 if (index >= 0)
24449 break;
24451 if (index < 0)
24452 return issue_rate; /* Didn't find IMUL producer. */
24454 if (sched_verbose > 1)
24455 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24456 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24458 /* Put IMUL producer (ready[index]) at the top of ready list. */
24459 insn1= ready[index];
24460 for (i = index; i < n_ready - 1; i++)
24461 ready[i] = ready[i + 1];
24462 ready[n_ready - 1] = insn1;
24464 return issue_rate;
24467 static bool
24468 ix86_class_likely_spilled_p (reg_class_t);
24470 /* Return true if the LHS of INSN is a HW function argument register; set
24471 IS_SPILLED to true if it is a likely-spilled HW register. */
24472 static bool
24473 insn_is_function_arg (rtx insn, bool* is_spilled)
24475 rtx dst;
24477 if (!NONDEBUG_INSN_P (insn))
24478 return false;
24479 /* Call instructions are not movable; ignore them. */
24480 if (CALL_P (insn))
24481 return false;
24482 insn = PATTERN (insn);
24483 if (GET_CODE (insn) == PARALLEL)
24484 insn = XVECEXP (insn, 0, 0);
24485 if (GET_CODE (insn) != SET)
24486 return false;
24487 dst = SET_DEST (insn);
24488 if (REG_P (dst) && HARD_REGISTER_P (dst)
24489 && ix86_function_arg_regno_p (REGNO (dst)))
24491 /* Is it likely spilled HW register? */
24492 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24493 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24494 *is_spilled = true;
24495 return true;
24497 return false;
24500 /* Add output dependencies for a chain of adjacent function arguments, but
24501 only if there is a move to a likely-spilled HW register. Return the first
24502 argument if at least one dependence was added, or NULL otherwise. */
24503 static rtx
24504 add_parameter_dependencies (rtx call, rtx head)
24506 rtx insn;
24507 rtx last = call;
24508 rtx first_arg = NULL;
24509 bool is_spilled = false;
24511 head = PREV_INSN (head);
24513 /* Find the argument-passing instruction nearest to the call. */
24514 while (true)
24516 last = PREV_INSN (last);
24517 if (last == head)
24518 return NULL;
24519 if (!NONDEBUG_INSN_P (last))
24520 continue;
24521 if (insn_is_function_arg (last, &is_spilled))
24522 break;
24523 return NULL;
24526 first_arg = last;
24527 while (true)
24529 insn = PREV_INSN (last);
24530 if (!INSN_P (insn))
24531 break;
24532 if (insn == head)
24533 break;
24534 if (!NONDEBUG_INSN_P (insn))
24536 last = insn;
24537 continue;
24539 if (insn_is_function_arg (insn, &is_spilled))
24541 /* Add an output dependence between two function arguments if the chain
24542 of output arguments contains likely-spilled HW registers. */
24543 if (is_spilled)
24544 add_dependence (last, insn, REG_DEP_OUTPUT);
24545 first_arg = last = insn;
24547 else
24548 break;
24550 if (!is_spilled)
24551 return NULL;
24552 return first_arg;
24555 /* Add output or anti dependency from insn to first_arg to restrict its code
24556 motion. */
24557 static void
24558 avoid_func_arg_motion (rtx first_arg, rtx insn)
24560 rtx set;
24561 rtx tmp;
24563 set = single_set (insn);
24564 if (!set)
24565 return;
24566 tmp = SET_DEST (set);
24567 if (REG_P (tmp))
24569 /* Add output dependency to the first function argument. */
24570 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24571 return;
24573 /* Add anti dependency. */
24574 add_dependence (first_arg, insn, REG_DEP_ANTI);
24577 /* Avoid cross-block motion of a function argument by adding a dependency
24578 from the first non-jump instruction in BB. */
24579 static void
24580 add_dependee_for_func_arg (rtx arg, basic_block bb)
24582 rtx insn = BB_END (bb);
24584 while (insn)
24586 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24588 rtx set = single_set (insn);
24589 if (set)
24591 avoid_func_arg_motion (arg, insn);
24592 return;
24595 if (insn == BB_HEAD (bb))
24596 return;
24597 insn = PREV_INSN (insn);
24601 /* Hook for pre-reload schedule - avoid motion of function arguments
24602 passed in likely spilled HW registers. */
24603 static void
24604 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24606 rtx insn;
24607 rtx first_arg = NULL;
24608 if (reload_completed)
24609 return;
24610 while (head != tail && DEBUG_INSN_P (head))
24611 head = NEXT_INSN (head);
24612 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24613 if (INSN_P (insn) && CALL_P (insn))
24615 first_arg = add_parameter_dependencies (insn, head);
24616 if (first_arg)
24618 /* Add a dependee for the first argument to predecessors, but only
24619 if the region contains more than one block. */
24620 basic_block bb = BLOCK_FOR_INSN (insn);
24621 int rgn = CONTAINING_RGN (bb->index);
24622 int nr_blks = RGN_NR_BLOCKS (rgn);
24623 /* Skip trivial regions and region head blocks that can have
24624 predecessors outside of region. */
24625 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24627 edge e;
24628 edge_iterator ei;
24629 /* Assume that region is SCC, i.e. all immediate predecessors
24630 of non-head block are in the same region. */
24631 FOR_EACH_EDGE (e, ei, bb->preds)
24633 /* Avoid creating loop-carried dependencies by
24634 using the topological ordering in the region. */
24635 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24636 add_dependee_for_func_arg (first_arg, e->src);
24639 insn = first_arg;
24640 if (insn == head)
24641 break;
24644 else if (first_arg)
24645 avoid_func_arg_motion (first_arg, insn);
24648 /* Hook for the pre-reload schedule - set the priority of moves from likely-
24649 spilled HW registers to the maximum, to schedule them as soon as possible.
24650 These are moves from function argument registers at the top of the function
24651 entry and moves from function return value registers after a call. */
24652 static int
24653 ix86_adjust_priority (rtx insn, int priority)
24655 rtx set;
24657 if (reload_completed)
24658 return priority;
24660 if (!NONDEBUG_INSN_P (insn))
24661 return priority;
24663 set = single_set (insn);
24664 if (set)
24666 rtx tmp = SET_SRC (set);
24667 if (REG_P (tmp)
24668 && HARD_REGISTER_P (tmp)
24669 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24670 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24671 return current_sched_info->sched_max_insns_priority;
24674 return priority;
24677 /* Model decoder of Core 2/i7.
24678 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24679 track the instruction fetch block boundaries and make sure that long
24680 (9+ bytes) instructions are assigned to D0. */
24682 /* Maximum length of an insn that can be handled by
24683 a secondary decoder unit. '8' for Core 2/i7. */
24684 static int core2i7_secondary_decoder_max_insn_size;
24686 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24687 '16' for Core 2/i7. */
24688 static int core2i7_ifetch_block_size;
24690 /* Maximum number of instructions decoder can handle per cycle.
24691 '6' for Core 2/i7. */
24692 static int core2i7_ifetch_block_max_insns;
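/* With the Core 2/i7 values installed in ix86_sched_init_global (8/16/6),
   one 16-byte fetch window supplies at most 6 instructions per cycle, and an
   instruction longer than 8 bytes cannot go to a secondary decoder, so it
   must be issued as the first instruction of its group.  */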
24694 typedef struct ix86_first_cycle_multipass_data_ *
24695 ix86_first_cycle_multipass_data_t;
24696 typedef const struct ix86_first_cycle_multipass_data_ *
24697 const_ix86_first_cycle_multipass_data_t;
24699 /* A variable to store target state across calls to max_issue within
24700 one cycle. */
24701 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24702 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24704 /* Initialize DATA. */
24705 static void
24706 core2i7_first_cycle_multipass_init (void *_data)
24708 ix86_first_cycle_multipass_data_t data
24709 = (ix86_first_cycle_multipass_data_t) _data;
24711 data->ifetch_block_len = 0;
24712 data->ifetch_block_n_insns = 0;
24713 data->ready_try_change = NULL;
24714 data->ready_try_change_size = 0;
24717 /* Advancing the cycle; reset ifetch block counts. */
24718 static void
24719 core2i7_dfa_post_advance_cycle (void)
24721 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24723 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24725 data->ifetch_block_len = 0;
24726 data->ifetch_block_n_insns = 0;
24729 static int min_insn_size (rtx);
24731 /* Filter out insns from ready_try that the core will not be able to issue
24732 on current cycle due to decoder. */
24733 static void
24734 core2i7_first_cycle_multipass_filter_ready_try
24735 (const_ix86_first_cycle_multipass_data_t data,
24736 char *ready_try, int n_ready, bool first_cycle_insn_p)
24738 while (n_ready--)
24740 rtx insn;
24741 int insn_size;
24743 if (ready_try[n_ready])
24744 continue;
24746 insn = get_ready_element (n_ready);
24747 insn_size = min_insn_size (insn);
24749 if (/* If this is too long an insn for a secondary decoder ... */
24750 (!first_cycle_insn_p
24751 && insn_size > core2i7_secondary_decoder_max_insn_size)
24752 /* ... or it would not fit into the ifetch block ... */
24753 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24754 /* ... or the decoder is full already ... */
24755 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24756 /* ... mask the insn out. */
24758 ready_try[n_ready] = 1;
24760 if (data->ready_try_change)
24761 bitmap_set_bit (data->ready_try_change, n_ready);
24766 /* Prepare for a new round of multipass lookahead scheduling. */
24767 static void
24768 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24769 bool first_cycle_insn_p)
24771 ix86_first_cycle_multipass_data_t data
24772 = (ix86_first_cycle_multipass_data_t) _data;
24773 const_ix86_first_cycle_multipass_data_t prev_data
24774 = ix86_first_cycle_multipass_data;
24776 /* Restore the state from the end of the previous round. */
24777 data->ifetch_block_len = prev_data->ifetch_block_len;
24778 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24780 /* Filter instructions that cannot be issued on current cycle due to
24781 decoder restrictions. */
24782 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24783 first_cycle_insn_p);
24786 /* INSN is being issued in current solution. Account for its impact on
24787 the decoder model. */
24788 static void
24789 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24790 rtx insn, const void *_prev_data)
24792 ix86_first_cycle_multipass_data_t data
24793 = (ix86_first_cycle_multipass_data_t) _data;
24794 const_ix86_first_cycle_multipass_data_t prev_data
24795 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24797 int insn_size = min_insn_size (insn);
24799 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24800 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24801 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24802 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24804 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24805 if (!data->ready_try_change)
24807 data->ready_try_change = sbitmap_alloc (n_ready);
24808 data->ready_try_change_size = n_ready;
24810 else if (data->ready_try_change_size < n_ready)
24812 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24813 n_ready, 0);
24814 data->ready_try_change_size = n_ready;
24816 bitmap_clear (data->ready_try_change);
24818 /* Filter out insns from ready_try that the core will not be able to issue
24819 on the current cycle due to decoder restrictions. */
24820 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24821 false);
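/* The bitmap cleared above ends up holding exactly the insns that this
   issue newly masks out of ready_try;
   core2i7_first_cycle_multipass_backtrack walks it to clear those
   entries again when the scheduler undoes the issue.  */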
24824 /* Revert the effect on ready_try. */
24825 static void
24826 core2i7_first_cycle_multipass_backtrack (const void *_data,
24827 char *ready_try,
24828 int n_ready ATTRIBUTE_UNUSED)
24830 const_ix86_first_cycle_multipass_data_t data
24831 = (const_ix86_first_cycle_multipass_data_t) _data;
24832 unsigned int i = 0;
24833 sbitmap_iterator sbi;
24835 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24836 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24838 ready_try[i] = 0;
24842 /* Save the result of multipass lookahead scheduling for the next round. */
24843 static void
24844 core2i7_first_cycle_multipass_end (const void *_data)
24846 const_ix86_first_cycle_multipass_data_t data
24847 = (const_ix86_first_cycle_multipass_data_t) _data;
24848 ix86_first_cycle_multipass_data_t next_data
24849 = ix86_first_cycle_multipass_data;
24851 if (data != NULL)
24853 next_data->ifetch_block_len = data->ifetch_block_len;
24854 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24858 /* Deallocate target data. */
24859 static void
24860 core2i7_first_cycle_multipass_fini (void *_data)
24862 ix86_first_cycle_multipass_data_t data
24863 = (ix86_first_cycle_multipass_data_t) _data;
24865 if (data->ready_try_change)
24867 sbitmap_free (data->ready_try_change);
24868 data->ready_try_change = NULL;
24869 data->ready_try_change_size = 0;
24873 /* Prepare for scheduling pass. */
24874 static void
24875 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24876 int verbose ATTRIBUTE_UNUSED,
24877 int max_uid ATTRIBUTE_UNUSED)
24879 /* Install scheduling hooks for the current CPU. Some of these hooks are used
24880 in time-critical parts of the scheduler, so we only set them up when
24881 they are actually used. */
24882 switch (ix86_tune)
24884 case PROCESSOR_CORE2:
24885 case PROCESSOR_COREI7:
24886 case PROCESSOR_HASWELL:
24887 /* Do not perform multipass scheduling for the pre-reload schedule,
24888 to save compile time. */
24889 if (reload_completed)
24891 targetm.sched.dfa_post_advance_cycle
24892 = core2i7_dfa_post_advance_cycle;
24893 targetm.sched.first_cycle_multipass_init
24894 = core2i7_first_cycle_multipass_init;
24895 targetm.sched.first_cycle_multipass_begin
24896 = core2i7_first_cycle_multipass_begin;
24897 targetm.sched.first_cycle_multipass_issue
24898 = core2i7_first_cycle_multipass_issue;
24899 targetm.sched.first_cycle_multipass_backtrack
24900 = core2i7_first_cycle_multipass_backtrack;
24901 targetm.sched.first_cycle_multipass_end
24902 = core2i7_first_cycle_multipass_end;
24903 targetm.sched.first_cycle_multipass_fini
24904 = core2i7_first_cycle_multipass_fini;
24906 /* Set decoder parameters. */
24907 core2i7_secondary_decoder_max_insn_size = 8;
24908 core2i7_ifetch_block_size = 16;
24909 core2i7_ifetch_block_max_insns = 6;
24910 break;
24912 /* ... Fall through ... */
24913 default:
24914 targetm.sched.dfa_post_advance_cycle = NULL;
24915 targetm.sched.first_cycle_multipass_init = NULL;
24916 targetm.sched.first_cycle_multipass_begin = NULL;
24917 targetm.sched.first_cycle_multipass_issue = NULL;
24918 targetm.sched.first_cycle_multipass_backtrack = NULL;
24919 targetm.sched.first_cycle_multipass_end = NULL;
24920 targetm.sched.first_cycle_multipass_fini = NULL;
24921 break;
24926 /* Compute the alignment given to a constant that is being placed in memory.
24927 EXP is the constant and ALIGN is the alignment that the object would
24928 ordinarily have.
24929 The value of this function is used instead of that alignment to align
24930 the object. */
24932 int
24933 ix86_constant_alignment (tree exp, int align)
24935 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24936 || TREE_CODE (exp) == INTEGER_CST)
24938 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24939 return 64;
24940 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24941 return 128;
24943 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24944 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24945 return BITS_PER_WORD;
24947 return align;
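/* For example, a DFmode constant that would otherwise get only 32-bit
   alignment is placed on a 64-bit boundary, and a constant whose mode
   satisfies ALIGN_MODE_128 on a 128-bit boundary, so it can be loaded
   with aligned accesses.  String constants of 31 characters or more are
   aligned to at least a word boundary unless we optimize for size.  */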
24950 /* Compute the alignment for a static variable.
24951 TYPE is the data type, and ALIGN is the alignment that
24952 the object would ordinarily have. The value of this function is used
24953 instead of that alignment to align the object. */
24955 int
24956 ix86_data_alignment (tree type, int align)
24958 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24960 if (AGGREGATE_TYPE_P (type)
24961 && TYPE_SIZE (type)
24962 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24963 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24964 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24965 && align < max_align)
24966 align = max_align;
24968 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24969 to a 16-byte boundary. */
24970 if (TARGET_64BIT)
24972 if (AGGREGATE_TYPE_P (type)
24973 && TYPE_SIZE (type)
24974 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24975 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24976 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24977 return 128;
24980 if (TREE_CODE (type) == ARRAY_TYPE)
24982 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24983 return 64;
24984 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24985 return 128;
24987 else if (TREE_CODE (type) == COMPLEX_TYPE)
24990 if (TYPE_MODE (type) == DCmode && align < 64)
24991 return 64;
24992 if ((TYPE_MODE (type) == XCmode
24993 || TYPE_MODE (type) == TCmode) && align < 128)
24994 return 128;
24996 else if ((TREE_CODE (type) == RECORD_TYPE
24997 || TREE_CODE (type) == UNION_TYPE
24998 || TREE_CODE (type) == QUAL_UNION_TYPE)
24999 && TYPE_FIELDS (type))
25001 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25002 return 64;
25003 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25004 return 128;
25006 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25007 || TREE_CODE (type) == INTEGER_TYPE)
25009 if (TYPE_MODE (type) == DFmode && align < 64)
25010 return 64;
25011 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25012 return 128;
25015 return align;
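/* A worked example: a file-scope "static double a[2]" (128 bits) falls
   under the x86-64 ABI clause above and comes back with 128-bit
   alignment, while an aggregate of at least max_align bits is bumped all
   the way to max_align (256 bits here) when we are not optimizing for
   size.  */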
25018 /* Compute the alignment for a local variable or a stack slot. EXP is
25019 the data type or decl itself, MODE is the widest mode available and
25020 ALIGN is the alignment that the object would ordinarily have. The
25021 value of this macro is used instead of that alignment to align the
25022 object. */
25024 unsigned int
25025 ix86_local_alignment (tree exp, enum machine_mode mode,
25026 unsigned int align)
25028 tree type, decl;
25030 if (exp && DECL_P (exp))
25032 type = TREE_TYPE (exp);
25033 decl = exp;
25035 else
25037 type = exp;
25038 decl = NULL;
25041 /* Don't do dynamic stack realignment for long long objects with
25042 -mpreferred-stack-boundary=2. */
25043 if (!TARGET_64BIT
25044 && align == 64
25045 && ix86_preferred_stack_boundary < 64
25046 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25047 && (!type || !TYPE_USER_ALIGN (type))
25048 && (!decl || !DECL_USER_ALIGN (decl)))
25049 align = 32;
25051 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25052 register in MODE. We will return the largest alignment of XF
25053 and DF. */
25054 if (!type)
25056 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25057 align = GET_MODE_ALIGNMENT (DFmode);
25058 return align;
25061 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
25062 to a 16-byte boundary. The exact wording is:
25064 An array uses the same alignment as its elements, except that a local or
25065 global array variable of length at least 16 bytes or
25066 a C99 variable-length array variable always has alignment of at least 16 bytes.
25068 This was added to allow use of aligned SSE instructions on arrays. The
25069 rule is meant for static storage (where the compiler cannot do the analysis
25070 by itself). We follow it for automatic variables only when it is convenient.
25071 We fully control everything in the function being compiled, and functions
25072 from other units cannot rely on the alignment.
25074 Exclude the va_list type. It is the common case of a local array where
25075 we cannot benefit from the alignment. */
25076 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25077 && TARGET_SSE)
25079 if (AGGREGATE_TYPE_P (type)
25080 && (va_list_type_node == NULL_TREE
25081 || (TYPE_MAIN_VARIANT (type)
25082 != TYPE_MAIN_VARIANT (va_list_type_node)))
25083 && TYPE_SIZE (type)
25084 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25085 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25086 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25087 return 128;
25089 if (TREE_CODE (type) == ARRAY_TYPE)
25091 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25092 return 64;
25093 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25094 return 128;
25096 else if (TREE_CODE (type) == COMPLEX_TYPE)
25098 if (TYPE_MODE (type) == DCmode && align < 64)
25099 return 64;
25100 if ((TYPE_MODE (type) == XCmode
25101 || TYPE_MODE (type) == TCmode) && align < 128)
25102 return 128;
25104 else if ((TREE_CODE (type) == RECORD_TYPE
25105 || TREE_CODE (type) == UNION_TYPE
25106 || TREE_CODE (type) == QUAL_UNION_TYPE)
25107 && TYPE_FIELDS (type))
25109 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25110 return 64;
25111 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25112 return 128;
25114 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25115 || TREE_CODE (type) == INTEGER_TYPE)
25118 if (TYPE_MODE (type) == DFmode && align < 64)
25119 return 64;
25120 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25121 return 128;
25123 return align;
25126 /* Compute the minimum required alignment for dynamic stack realignment
25127 purposes for a local variable, parameter or a stack slot. EXP is
25128 the data type or decl itself, MODE is its mode and ALIGN is the
25129 alignment that the object would ordinarily have. */
25131 unsigned int
25132 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25133 unsigned int align)
25135 tree type, decl;
25137 if (exp && DECL_P (exp))
25139 type = TREE_TYPE (exp);
25140 decl = exp;
25142 else
25144 type = exp;
25145 decl = NULL;
25148 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25149 return align;
25151 /* Don't do dynamic stack realignment for long long objects with
25152 -mpreferred-stack-boundary=2. */
25153 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25154 && (!type || !TYPE_USER_ALIGN (type))
25155 && (!decl || !DECL_USER_ALIGN (decl)))
25156 return 32;
25158 return align;
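/* For example, a local "long long" compiled for ia32 with
   -mpreferred-stack-boundary=2 reports a minimum alignment of 32 bits
   here, so the variable by itself does not force dynamic stack
   realignment; ix86_local_alignment makes the matching decision when the
   slot is laid out.  */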
25161 /* Find a location for the static chain incoming to a nested function.
25162 This is a register, unless all free registers are used by arguments. */
25164 static rtx
25165 ix86_static_chain (const_tree fndecl, bool incoming_p)
25167 unsigned regno;
25169 if (!DECL_STATIC_CHAIN (fndecl))
25170 return NULL;
25172 if (TARGET_64BIT)
25174 /* We always use R10 in 64-bit mode. */
25175 regno = R10_REG;
25177 else
25179 tree fntype;
25180 unsigned int ccvt;
25182 /* By default in 32-bit mode we use ECX to pass the static chain. */
25183 regno = CX_REG;
25185 fntype = TREE_TYPE (fndecl);
25186 ccvt = ix86_get_callcvt (fntype);
25187 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25189 /* Fastcall functions use ecx/edx for arguments, which leaves
25190 us with EAX for the static chain.
25191 Thiscall functions use ecx for arguments, which also
25192 leaves us with EAX for the static chain. */
25193 regno = AX_REG;
25195 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25197 /* Thiscall functions use ecx for arguments, which leaves
25198 us with EAX and EDX for the static chain.
25199 For ABI compatibility we use EAX. */
25200 regno = AX_REG;
25202 else if (ix86_function_regparm (fntype, fndecl) == 3)
25204 /* For regparm 3, we have no free call-clobbered registers in
25205 which to store the static chain. In order to implement this,
25206 we have the trampoline push the static chain to the stack.
25207 However, we can't push a value below the return address when
25208 we call the nested function directly, so we have to use an
25209 alternate entry point. For this we use ESI, and have the
25210 alternate entry point push ESI, so that things appear the
25211 same once we're executing the nested function. */
25212 if (incoming_p)
25214 if (fndecl == current_function_decl)
25215 ix86_static_chain_on_stack = true;
25216 return gen_frame_mem (SImode,
25217 plus_constant (Pmode,
25218 arg_pointer_rtx, -8));
25220 regno = SI_REG;
25224 return gen_rtx_REG (Pmode, regno);
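/* To summarize the cases above:
     64-bit                        -> R10
     32-bit, default conventions   -> ECX
     32-bit, fastcall or thiscall  -> EAX
     32-bit, regparm(3)            -> ESI on the caller side; the callee,
                                      entered through its alternate entry
                                      point that pushes ESI, finds the
                                      chain in a stack slot at
                                      arg_pointer - 8.  */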
25227 /* Emit RTL insns to initialize the variable parts of a trampoline.
25228 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25229 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25230 to be passed to the target function. */
25232 static void
25233 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25235 rtx mem, fnaddr;
25236 int opcode;
25237 int offset = 0;
25239 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25241 if (TARGET_64BIT)
25243 int size;
25245 /* Load the function address into r11. Try to load the address using
25246 the shorter movl instead of movabs. We may want to support
25247 movq for kernel mode, but the kernel does not use trampolines at
25248 the moment. FNADDR is a 32-bit address and may not be in
25249 DImode when ptr_mode == SImode. Always use movl in this
25250 case. */
25251 if (ptr_mode == SImode
25252 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25254 fnaddr = copy_addr_to_reg (fnaddr);
25256 mem = adjust_address (m_tramp, HImode, offset);
25257 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25259 mem = adjust_address (m_tramp, SImode, offset + 2);
25260 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25261 offset += 6;
25263 else
25265 mem = adjust_address (m_tramp, HImode, offset);
25266 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25268 mem = adjust_address (m_tramp, DImode, offset + 2);
25269 emit_move_insn (mem, fnaddr);
25270 offset += 10;
25273 /* Load static chain using movabs to r10. Use the shorter movl
25274 instead of movabs when ptr_mode == SImode. */
25275 if (ptr_mode == SImode)
25277 opcode = 0xba41;
25278 size = 6;
25280 else
25282 opcode = 0xba49;
25283 size = 10;
25286 mem = adjust_address (m_tramp, HImode, offset);
25287 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25289 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25290 emit_move_insn (mem, chain_value);
25291 offset += size;
25293 /* Jump to r11; the last (unused) byte is a nop, only there to
25294 pad the write out to a single 32-bit store. */
25295 mem = adjust_address (m_tramp, SImode, offset);
25296 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25297 offset += 4;
25299 else
25301 rtx disp, chain;
25303 /* Depending on the static chain location, either load a register
25304 with a constant, or push the constant to the stack. All of the
25305 instructions are the same size. */
25306 chain = ix86_static_chain (fndecl, true);
25307 if (REG_P (chain))
25309 switch (REGNO (chain))
25311 case AX_REG:
25312 opcode = 0xb8; break;
25313 case CX_REG:
25314 opcode = 0xb9; break;
25315 default:
25316 gcc_unreachable ();
25319 else
25320 opcode = 0x68;
25322 mem = adjust_address (m_tramp, QImode, offset);
25323 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25325 mem = adjust_address (m_tramp, SImode, offset + 1);
25326 emit_move_insn (mem, chain_value);
25327 offset += 5;
25329 mem = adjust_address (m_tramp, QImode, offset);
25330 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25332 mem = adjust_address (m_tramp, SImode, offset + 1);
25334 /* Compute offset from the end of the jmp to the target function.
25335 In the case in which the trampoline stores the static chain on
25336 the stack, we need to skip the first insn which pushes the
25337 (call-saved) register static chain; this push is 1 byte. */
25338 offset += 5;
25339 disp = expand_binop (SImode, sub_optab, fnaddr,
25340 plus_constant (Pmode, XEXP (m_tramp, 0),
25341 offset - (MEM_P (chain) ? 1 : 0)),
25342 NULL_RTX, 1, OPTAB_DIRECT);
25343 emit_move_insn (mem, disp);
25346 gcc_assert (offset <= TRAMPOLINE_SIZE);
25348 #ifdef HAVE_ENABLE_EXECUTE_STACK
25349 #ifdef CHECK_EXECUTE_STACK_ENABLED
25350 if (CHECK_EXECUTE_STACK_ENABLED)
25351 #endif
25352 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25353 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25354 #endif
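/* For reference, the bytes emitted above (stores are little-endian) form,
   in the 64-bit case with a full 64-bit function address:

       49 bb <imm64>    movabs $fnaddr, %r11   (41 bb <imm32> when movl fits)
       49 ba <imm64>    movabs $chain,  %r10   (41 ba <imm32> likewise)
       49 ff e3 90      rex.WB jmp *%r11; nop

   and in the 32-bit case with a register static chain:

       b9 <imm32>       movl $chain, %ecx      (b8 for %eax, 68 for pushl)
       e9 <rel32>       jmp  <fnaddr>

   where the rel32 displacement is taken from the end of the jmp, adjusted
   by one byte when the chain is pushed so that the jump skips the
   one-byte push at the nested function's alternate entry point.  */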
25357 /* The following file contains several enumerations and data structures
25358 built from the definitions in i386-builtin-types.def. */
25360 #include "i386-builtin-types.inc"
25362 /* Table for the ix86 builtin non-function types. */
25363 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25365 /* Retrieve an element from the above table, building some of
25366 the types lazily. */
25368 static tree
25369 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25371 unsigned int index;
25372 tree type, itype;
25374 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25376 type = ix86_builtin_type_tab[(int) tcode];
25377 if (type != NULL)
25378 return type;
25380 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25381 if (tcode <= IX86_BT_LAST_VECT)
25383 enum machine_mode mode;
25385 index = tcode - IX86_BT_LAST_PRIM - 1;
25386 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25387 mode = ix86_builtin_type_vect_mode[index];
25389 type = build_vector_type_for_mode (itype, mode);
25391 else
25393 int quals;
25395 index = tcode - IX86_BT_LAST_VECT - 1;
25396 if (tcode <= IX86_BT_LAST_PTR)
25397 quals = TYPE_UNQUALIFIED;
25398 else
25399 quals = TYPE_QUAL_CONST;
25401 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25402 if (quals != TYPE_UNQUALIFIED)
25403 itype = build_qualified_type (itype, quals);
25405 type = build_pointer_type (itype);
25408 ix86_builtin_type_tab[(int) tcode] = type;
25409 return type;
25412 /* Table for the ix86 builtin function types. */
25413 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25415 /* Retrieve an element from the above table, building some of
25416 the types lazily. */
25418 static tree
25419 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25421 tree type;
25423 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25425 type = ix86_builtin_func_type_tab[(int) tcode];
25426 if (type != NULL)
25427 return type;
25429 if (tcode <= IX86_BT_LAST_FUNC)
25431 unsigned start = ix86_builtin_func_start[(int) tcode];
25432 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25433 tree rtype, atype, args = void_list_node;
25434 unsigned i;
25436 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25437 for (i = after - 1; i > start; --i)
25439 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25440 args = tree_cons (NULL, atype, args);
25443 type = build_function_type (rtype, args);
25445 else
25447 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25448 enum ix86_builtin_func_type icode;
25450 icode = ix86_builtin_func_alias_base[index];
25451 type = ix86_get_builtin_func_type (icode);
25454 ix86_builtin_func_type_tab[(int) tcode] = type;
25455 return type;
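/* The packed ix86_builtin_func_args table stores, for each function
   code, the return type first and then the argument types in order; the
   loop above walks its slice backwards so that the tree_cons chain ends
   up in the original argument order.  Alias codes past IX86_BT_LAST_FUNC
   simply share the type of their base code.  */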
25459 /* Codes for all the SSE/MMX builtins. */
25460 enum ix86_builtins
25462 IX86_BUILTIN_ADDPS,
25463 IX86_BUILTIN_ADDSS,
25464 IX86_BUILTIN_DIVPS,
25465 IX86_BUILTIN_DIVSS,
25466 IX86_BUILTIN_MULPS,
25467 IX86_BUILTIN_MULSS,
25468 IX86_BUILTIN_SUBPS,
25469 IX86_BUILTIN_SUBSS,
25471 IX86_BUILTIN_CMPEQPS,
25472 IX86_BUILTIN_CMPLTPS,
25473 IX86_BUILTIN_CMPLEPS,
25474 IX86_BUILTIN_CMPGTPS,
25475 IX86_BUILTIN_CMPGEPS,
25476 IX86_BUILTIN_CMPNEQPS,
25477 IX86_BUILTIN_CMPNLTPS,
25478 IX86_BUILTIN_CMPNLEPS,
25479 IX86_BUILTIN_CMPNGTPS,
25480 IX86_BUILTIN_CMPNGEPS,
25481 IX86_BUILTIN_CMPORDPS,
25482 IX86_BUILTIN_CMPUNORDPS,
25483 IX86_BUILTIN_CMPEQSS,
25484 IX86_BUILTIN_CMPLTSS,
25485 IX86_BUILTIN_CMPLESS,
25486 IX86_BUILTIN_CMPNEQSS,
25487 IX86_BUILTIN_CMPNLTSS,
25488 IX86_BUILTIN_CMPNLESS,
25489 IX86_BUILTIN_CMPNGTSS,
25490 IX86_BUILTIN_CMPNGESS,
25491 IX86_BUILTIN_CMPORDSS,
25492 IX86_BUILTIN_CMPUNORDSS,
25494 IX86_BUILTIN_COMIEQSS,
25495 IX86_BUILTIN_COMILTSS,
25496 IX86_BUILTIN_COMILESS,
25497 IX86_BUILTIN_COMIGTSS,
25498 IX86_BUILTIN_COMIGESS,
25499 IX86_BUILTIN_COMINEQSS,
25500 IX86_BUILTIN_UCOMIEQSS,
25501 IX86_BUILTIN_UCOMILTSS,
25502 IX86_BUILTIN_UCOMILESS,
25503 IX86_BUILTIN_UCOMIGTSS,
25504 IX86_BUILTIN_UCOMIGESS,
25505 IX86_BUILTIN_UCOMINEQSS,
25507 IX86_BUILTIN_CVTPI2PS,
25508 IX86_BUILTIN_CVTPS2PI,
25509 IX86_BUILTIN_CVTSI2SS,
25510 IX86_BUILTIN_CVTSI642SS,
25511 IX86_BUILTIN_CVTSS2SI,
25512 IX86_BUILTIN_CVTSS2SI64,
25513 IX86_BUILTIN_CVTTPS2PI,
25514 IX86_BUILTIN_CVTTSS2SI,
25515 IX86_BUILTIN_CVTTSS2SI64,
25517 IX86_BUILTIN_MAXPS,
25518 IX86_BUILTIN_MAXSS,
25519 IX86_BUILTIN_MINPS,
25520 IX86_BUILTIN_MINSS,
25522 IX86_BUILTIN_LOADUPS,
25523 IX86_BUILTIN_STOREUPS,
25524 IX86_BUILTIN_MOVSS,
25526 IX86_BUILTIN_MOVHLPS,
25527 IX86_BUILTIN_MOVLHPS,
25528 IX86_BUILTIN_LOADHPS,
25529 IX86_BUILTIN_LOADLPS,
25530 IX86_BUILTIN_STOREHPS,
25531 IX86_BUILTIN_STORELPS,
25533 IX86_BUILTIN_MASKMOVQ,
25534 IX86_BUILTIN_MOVMSKPS,
25535 IX86_BUILTIN_PMOVMSKB,
25537 IX86_BUILTIN_MOVNTPS,
25538 IX86_BUILTIN_MOVNTQ,
25540 IX86_BUILTIN_LOADDQU,
25541 IX86_BUILTIN_STOREDQU,
25543 IX86_BUILTIN_PACKSSWB,
25544 IX86_BUILTIN_PACKSSDW,
25545 IX86_BUILTIN_PACKUSWB,
25547 IX86_BUILTIN_PADDB,
25548 IX86_BUILTIN_PADDW,
25549 IX86_BUILTIN_PADDD,
25550 IX86_BUILTIN_PADDQ,
25551 IX86_BUILTIN_PADDSB,
25552 IX86_BUILTIN_PADDSW,
25553 IX86_BUILTIN_PADDUSB,
25554 IX86_BUILTIN_PADDUSW,
25555 IX86_BUILTIN_PSUBB,
25556 IX86_BUILTIN_PSUBW,
25557 IX86_BUILTIN_PSUBD,
25558 IX86_BUILTIN_PSUBQ,
25559 IX86_BUILTIN_PSUBSB,
25560 IX86_BUILTIN_PSUBSW,
25561 IX86_BUILTIN_PSUBUSB,
25562 IX86_BUILTIN_PSUBUSW,
25564 IX86_BUILTIN_PAND,
25565 IX86_BUILTIN_PANDN,
25566 IX86_BUILTIN_POR,
25567 IX86_BUILTIN_PXOR,
25569 IX86_BUILTIN_PAVGB,
25570 IX86_BUILTIN_PAVGW,
25572 IX86_BUILTIN_PCMPEQB,
25573 IX86_BUILTIN_PCMPEQW,
25574 IX86_BUILTIN_PCMPEQD,
25575 IX86_BUILTIN_PCMPGTB,
25576 IX86_BUILTIN_PCMPGTW,
25577 IX86_BUILTIN_PCMPGTD,
25579 IX86_BUILTIN_PMADDWD,
25581 IX86_BUILTIN_PMAXSW,
25582 IX86_BUILTIN_PMAXUB,
25583 IX86_BUILTIN_PMINSW,
25584 IX86_BUILTIN_PMINUB,
25586 IX86_BUILTIN_PMULHUW,
25587 IX86_BUILTIN_PMULHW,
25588 IX86_BUILTIN_PMULLW,
25590 IX86_BUILTIN_PSADBW,
25591 IX86_BUILTIN_PSHUFW,
25593 IX86_BUILTIN_PSLLW,
25594 IX86_BUILTIN_PSLLD,
25595 IX86_BUILTIN_PSLLQ,
25596 IX86_BUILTIN_PSRAW,
25597 IX86_BUILTIN_PSRAD,
25598 IX86_BUILTIN_PSRLW,
25599 IX86_BUILTIN_PSRLD,
25600 IX86_BUILTIN_PSRLQ,
25601 IX86_BUILTIN_PSLLWI,
25602 IX86_BUILTIN_PSLLDI,
25603 IX86_BUILTIN_PSLLQI,
25604 IX86_BUILTIN_PSRAWI,
25605 IX86_BUILTIN_PSRADI,
25606 IX86_BUILTIN_PSRLWI,
25607 IX86_BUILTIN_PSRLDI,
25608 IX86_BUILTIN_PSRLQI,
25610 IX86_BUILTIN_PUNPCKHBW,
25611 IX86_BUILTIN_PUNPCKHWD,
25612 IX86_BUILTIN_PUNPCKHDQ,
25613 IX86_BUILTIN_PUNPCKLBW,
25614 IX86_BUILTIN_PUNPCKLWD,
25615 IX86_BUILTIN_PUNPCKLDQ,
25617 IX86_BUILTIN_SHUFPS,
25619 IX86_BUILTIN_RCPPS,
25620 IX86_BUILTIN_RCPSS,
25621 IX86_BUILTIN_RSQRTPS,
25622 IX86_BUILTIN_RSQRTPS_NR,
25623 IX86_BUILTIN_RSQRTSS,
25624 IX86_BUILTIN_RSQRTF,
25625 IX86_BUILTIN_SQRTPS,
25626 IX86_BUILTIN_SQRTPS_NR,
25627 IX86_BUILTIN_SQRTSS,
25629 IX86_BUILTIN_UNPCKHPS,
25630 IX86_BUILTIN_UNPCKLPS,
25632 IX86_BUILTIN_ANDPS,
25633 IX86_BUILTIN_ANDNPS,
25634 IX86_BUILTIN_ORPS,
25635 IX86_BUILTIN_XORPS,
25637 IX86_BUILTIN_EMMS,
25638 IX86_BUILTIN_LDMXCSR,
25639 IX86_BUILTIN_STMXCSR,
25640 IX86_BUILTIN_SFENCE,
25642 IX86_BUILTIN_FXSAVE,
25643 IX86_BUILTIN_FXRSTOR,
25644 IX86_BUILTIN_FXSAVE64,
25645 IX86_BUILTIN_FXRSTOR64,
25647 IX86_BUILTIN_XSAVE,
25648 IX86_BUILTIN_XRSTOR,
25649 IX86_BUILTIN_XSAVE64,
25650 IX86_BUILTIN_XRSTOR64,
25652 IX86_BUILTIN_XSAVEOPT,
25653 IX86_BUILTIN_XSAVEOPT64,
25655 /* 3DNow! Original */
25656 IX86_BUILTIN_FEMMS,
25657 IX86_BUILTIN_PAVGUSB,
25658 IX86_BUILTIN_PF2ID,
25659 IX86_BUILTIN_PFACC,
25660 IX86_BUILTIN_PFADD,
25661 IX86_BUILTIN_PFCMPEQ,
25662 IX86_BUILTIN_PFCMPGE,
25663 IX86_BUILTIN_PFCMPGT,
25664 IX86_BUILTIN_PFMAX,
25665 IX86_BUILTIN_PFMIN,
25666 IX86_BUILTIN_PFMUL,
25667 IX86_BUILTIN_PFRCP,
25668 IX86_BUILTIN_PFRCPIT1,
25669 IX86_BUILTIN_PFRCPIT2,
25670 IX86_BUILTIN_PFRSQIT1,
25671 IX86_BUILTIN_PFRSQRT,
25672 IX86_BUILTIN_PFSUB,
25673 IX86_BUILTIN_PFSUBR,
25674 IX86_BUILTIN_PI2FD,
25675 IX86_BUILTIN_PMULHRW,
25677 /* 3DNow! Athlon Extensions */
25678 IX86_BUILTIN_PF2IW,
25679 IX86_BUILTIN_PFNACC,
25680 IX86_BUILTIN_PFPNACC,
25681 IX86_BUILTIN_PI2FW,
25682 IX86_BUILTIN_PSWAPDSI,
25683 IX86_BUILTIN_PSWAPDSF,
25685 /* SSE2 */
25686 IX86_BUILTIN_ADDPD,
25687 IX86_BUILTIN_ADDSD,
25688 IX86_BUILTIN_DIVPD,
25689 IX86_BUILTIN_DIVSD,
25690 IX86_BUILTIN_MULPD,
25691 IX86_BUILTIN_MULSD,
25692 IX86_BUILTIN_SUBPD,
25693 IX86_BUILTIN_SUBSD,
25695 IX86_BUILTIN_CMPEQPD,
25696 IX86_BUILTIN_CMPLTPD,
25697 IX86_BUILTIN_CMPLEPD,
25698 IX86_BUILTIN_CMPGTPD,
25699 IX86_BUILTIN_CMPGEPD,
25700 IX86_BUILTIN_CMPNEQPD,
25701 IX86_BUILTIN_CMPNLTPD,
25702 IX86_BUILTIN_CMPNLEPD,
25703 IX86_BUILTIN_CMPNGTPD,
25704 IX86_BUILTIN_CMPNGEPD,
25705 IX86_BUILTIN_CMPORDPD,
25706 IX86_BUILTIN_CMPUNORDPD,
25707 IX86_BUILTIN_CMPEQSD,
25708 IX86_BUILTIN_CMPLTSD,
25709 IX86_BUILTIN_CMPLESD,
25710 IX86_BUILTIN_CMPNEQSD,
25711 IX86_BUILTIN_CMPNLTSD,
25712 IX86_BUILTIN_CMPNLESD,
25713 IX86_BUILTIN_CMPORDSD,
25714 IX86_BUILTIN_CMPUNORDSD,
25716 IX86_BUILTIN_COMIEQSD,
25717 IX86_BUILTIN_COMILTSD,
25718 IX86_BUILTIN_COMILESD,
25719 IX86_BUILTIN_COMIGTSD,
25720 IX86_BUILTIN_COMIGESD,
25721 IX86_BUILTIN_COMINEQSD,
25722 IX86_BUILTIN_UCOMIEQSD,
25723 IX86_BUILTIN_UCOMILTSD,
25724 IX86_BUILTIN_UCOMILESD,
25725 IX86_BUILTIN_UCOMIGTSD,
25726 IX86_BUILTIN_UCOMIGESD,
25727 IX86_BUILTIN_UCOMINEQSD,
25729 IX86_BUILTIN_MAXPD,
25730 IX86_BUILTIN_MAXSD,
25731 IX86_BUILTIN_MINPD,
25732 IX86_BUILTIN_MINSD,
25734 IX86_BUILTIN_ANDPD,
25735 IX86_BUILTIN_ANDNPD,
25736 IX86_BUILTIN_ORPD,
25737 IX86_BUILTIN_XORPD,
25739 IX86_BUILTIN_SQRTPD,
25740 IX86_BUILTIN_SQRTSD,
25742 IX86_BUILTIN_UNPCKHPD,
25743 IX86_BUILTIN_UNPCKLPD,
25745 IX86_BUILTIN_SHUFPD,
25747 IX86_BUILTIN_LOADUPD,
25748 IX86_BUILTIN_STOREUPD,
25749 IX86_BUILTIN_MOVSD,
25751 IX86_BUILTIN_LOADHPD,
25752 IX86_BUILTIN_LOADLPD,
25754 IX86_BUILTIN_CVTDQ2PD,
25755 IX86_BUILTIN_CVTDQ2PS,
25757 IX86_BUILTIN_CVTPD2DQ,
25758 IX86_BUILTIN_CVTPD2PI,
25759 IX86_BUILTIN_CVTPD2PS,
25760 IX86_BUILTIN_CVTTPD2DQ,
25761 IX86_BUILTIN_CVTTPD2PI,
25763 IX86_BUILTIN_CVTPI2PD,
25764 IX86_BUILTIN_CVTSI2SD,
25765 IX86_BUILTIN_CVTSI642SD,
25767 IX86_BUILTIN_CVTSD2SI,
25768 IX86_BUILTIN_CVTSD2SI64,
25769 IX86_BUILTIN_CVTSD2SS,
25770 IX86_BUILTIN_CVTSS2SD,
25771 IX86_BUILTIN_CVTTSD2SI,
25772 IX86_BUILTIN_CVTTSD2SI64,
25774 IX86_BUILTIN_CVTPS2DQ,
25775 IX86_BUILTIN_CVTPS2PD,
25776 IX86_BUILTIN_CVTTPS2DQ,
25778 IX86_BUILTIN_MOVNTI,
25779 IX86_BUILTIN_MOVNTI64,
25780 IX86_BUILTIN_MOVNTPD,
25781 IX86_BUILTIN_MOVNTDQ,
25783 IX86_BUILTIN_MOVQ128,
25785 /* SSE2 MMX */
25786 IX86_BUILTIN_MASKMOVDQU,
25787 IX86_BUILTIN_MOVMSKPD,
25788 IX86_BUILTIN_PMOVMSKB128,
25790 IX86_BUILTIN_PACKSSWB128,
25791 IX86_BUILTIN_PACKSSDW128,
25792 IX86_BUILTIN_PACKUSWB128,
25794 IX86_BUILTIN_PADDB128,
25795 IX86_BUILTIN_PADDW128,
25796 IX86_BUILTIN_PADDD128,
25797 IX86_BUILTIN_PADDQ128,
25798 IX86_BUILTIN_PADDSB128,
25799 IX86_BUILTIN_PADDSW128,
25800 IX86_BUILTIN_PADDUSB128,
25801 IX86_BUILTIN_PADDUSW128,
25802 IX86_BUILTIN_PSUBB128,
25803 IX86_BUILTIN_PSUBW128,
25804 IX86_BUILTIN_PSUBD128,
25805 IX86_BUILTIN_PSUBQ128,
25806 IX86_BUILTIN_PSUBSB128,
25807 IX86_BUILTIN_PSUBSW128,
25808 IX86_BUILTIN_PSUBUSB128,
25809 IX86_BUILTIN_PSUBUSW128,
25811 IX86_BUILTIN_PAND128,
25812 IX86_BUILTIN_PANDN128,
25813 IX86_BUILTIN_POR128,
25814 IX86_BUILTIN_PXOR128,
25816 IX86_BUILTIN_PAVGB128,
25817 IX86_BUILTIN_PAVGW128,
25819 IX86_BUILTIN_PCMPEQB128,
25820 IX86_BUILTIN_PCMPEQW128,
25821 IX86_BUILTIN_PCMPEQD128,
25822 IX86_BUILTIN_PCMPGTB128,
25823 IX86_BUILTIN_PCMPGTW128,
25824 IX86_BUILTIN_PCMPGTD128,
25826 IX86_BUILTIN_PMADDWD128,
25828 IX86_BUILTIN_PMAXSW128,
25829 IX86_BUILTIN_PMAXUB128,
25830 IX86_BUILTIN_PMINSW128,
25831 IX86_BUILTIN_PMINUB128,
25833 IX86_BUILTIN_PMULUDQ,
25834 IX86_BUILTIN_PMULUDQ128,
25835 IX86_BUILTIN_PMULHUW128,
25836 IX86_BUILTIN_PMULHW128,
25837 IX86_BUILTIN_PMULLW128,
25839 IX86_BUILTIN_PSADBW128,
25840 IX86_BUILTIN_PSHUFHW,
25841 IX86_BUILTIN_PSHUFLW,
25842 IX86_BUILTIN_PSHUFD,
25844 IX86_BUILTIN_PSLLDQI128,
25845 IX86_BUILTIN_PSLLWI128,
25846 IX86_BUILTIN_PSLLDI128,
25847 IX86_BUILTIN_PSLLQI128,
25848 IX86_BUILTIN_PSRAWI128,
25849 IX86_BUILTIN_PSRADI128,
25850 IX86_BUILTIN_PSRLDQI128,
25851 IX86_BUILTIN_PSRLWI128,
25852 IX86_BUILTIN_PSRLDI128,
25853 IX86_BUILTIN_PSRLQI128,
25855 IX86_BUILTIN_PSLLDQ128,
25856 IX86_BUILTIN_PSLLW128,
25857 IX86_BUILTIN_PSLLD128,
25858 IX86_BUILTIN_PSLLQ128,
25859 IX86_BUILTIN_PSRAW128,
25860 IX86_BUILTIN_PSRAD128,
25861 IX86_BUILTIN_PSRLW128,
25862 IX86_BUILTIN_PSRLD128,
25863 IX86_BUILTIN_PSRLQ128,
25865 IX86_BUILTIN_PUNPCKHBW128,
25866 IX86_BUILTIN_PUNPCKHWD128,
25867 IX86_BUILTIN_PUNPCKHDQ128,
25868 IX86_BUILTIN_PUNPCKHQDQ128,
25869 IX86_BUILTIN_PUNPCKLBW128,
25870 IX86_BUILTIN_PUNPCKLWD128,
25871 IX86_BUILTIN_PUNPCKLDQ128,
25872 IX86_BUILTIN_PUNPCKLQDQ128,
25874 IX86_BUILTIN_CLFLUSH,
25875 IX86_BUILTIN_MFENCE,
25876 IX86_BUILTIN_LFENCE,
25877 IX86_BUILTIN_PAUSE,
25879 IX86_BUILTIN_BSRSI,
25880 IX86_BUILTIN_BSRDI,
25881 IX86_BUILTIN_RDPMC,
25882 IX86_BUILTIN_RDTSC,
25883 IX86_BUILTIN_RDTSCP,
25884 IX86_BUILTIN_ROLQI,
25885 IX86_BUILTIN_ROLHI,
25886 IX86_BUILTIN_RORQI,
25887 IX86_BUILTIN_RORHI,
25889 /* SSE3. */
25890 IX86_BUILTIN_ADDSUBPS,
25891 IX86_BUILTIN_HADDPS,
25892 IX86_BUILTIN_HSUBPS,
25893 IX86_BUILTIN_MOVSHDUP,
25894 IX86_BUILTIN_MOVSLDUP,
25895 IX86_BUILTIN_ADDSUBPD,
25896 IX86_BUILTIN_HADDPD,
25897 IX86_BUILTIN_HSUBPD,
25898 IX86_BUILTIN_LDDQU,
25900 IX86_BUILTIN_MONITOR,
25901 IX86_BUILTIN_MWAIT,
25903 /* SSSE3. */
25904 IX86_BUILTIN_PHADDW,
25905 IX86_BUILTIN_PHADDD,
25906 IX86_BUILTIN_PHADDSW,
25907 IX86_BUILTIN_PHSUBW,
25908 IX86_BUILTIN_PHSUBD,
25909 IX86_BUILTIN_PHSUBSW,
25910 IX86_BUILTIN_PMADDUBSW,
25911 IX86_BUILTIN_PMULHRSW,
25912 IX86_BUILTIN_PSHUFB,
25913 IX86_BUILTIN_PSIGNB,
25914 IX86_BUILTIN_PSIGNW,
25915 IX86_BUILTIN_PSIGND,
25916 IX86_BUILTIN_PALIGNR,
25917 IX86_BUILTIN_PABSB,
25918 IX86_BUILTIN_PABSW,
25919 IX86_BUILTIN_PABSD,
25921 IX86_BUILTIN_PHADDW128,
25922 IX86_BUILTIN_PHADDD128,
25923 IX86_BUILTIN_PHADDSW128,
25924 IX86_BUILTIN_PHSUBW128,
25925 IX86_BUILTIN_PHSUBD128,
25926 IX86_BUILTIN_PHSUBSW128,
25927 IX86_BUILTIN_PMADDUBSW128,
25928 IX86_BUILTIN_PMULHRSW128,
25929 IX86_BUILTIN_PSHUFB128,
25930 IX86_BUILTIN_PSIGNB128,
25931 IX86_BUILTIN_PSIGNW128,
25932 IX86_BUILTIN_PSIGND128,
25933 IX86_BUILTIN_PALIGNR128,
25934 IX86_BUILTIN_PABSB128,
25935 IX86_BUILTIN_PABSW128,
25936 IX86_BUILTIN_PABSD128,
25938 /* AMDFAM10 - SSE4A New Instructions. */
25939 IX86_BUILTIN_MOVNTSD,
25940 IX86_BUILTIN_MOVNTSS,
25941 IX86_BUILTIN_EXTRQI,
25942 IX86_BUILTIN_EXTRQ,
25943 IX86_BUILTIN_INSERTQI,
25944 IX86_BUILTIN_INSERTQ,
25946 /* SSE4.1. */
25947 IX86_BUILTIN_BLENDPD,
25948 IX86_BUILTIN_BLENDPS,
25949 IX86_BUILTIN_BLENDVPD,
25950 IX86_BUILTIN_BLENDVPS,
25951 IX86_BUILTIN_PBLENDVB128,
25952 IX86_BUILTIN_PBLENDW128,
25954 IX86_BUILTIN_DPPD,
25955 IX86_BUILTIN_DPPS,
25957 IX86_BUILTIN_INSERTPS128,
25959 IX86_BUILTIN_MOVNTDQA,
25960 IX86_BUILTIN_MPSADBW128,
25961 IX86_BUILTIN_PACKUSDW128,
25962 IX86_BUILTIN_PCMPEQQ,
25963 IX86_BUILTIN_PHMINPOSUW128,
25965 IX86_BUILTIN_PMAXSB128,
25966 IX86_BUILTIN_PMAXSD128,
25967 IX86_BUILTIN_PMAXUD128,
25968 IX86_BUILTIN_PMAXUW128,
25970 IX86_BUILTIN_PMINSB128,
25971 IX86_BUILTIN_PMINSD128,
25972 IX86_BUILTIN_PMINUD128,
25973 IX86_BUILTIN_PMINUW128,
25975 IX86_BUILTIN_PMOVSXBW128,
25976 IX86_BUILTIN_PMOVSXBD128,
25977 IX86_BUILTIN_PMOVSXBQ128,
25978 IX86_BUILTIN_PMOVSXWD128,
25979 IX86_BUILTIN_PMOVSXWQ128,
25980 IX86_BUILTIN_PMOVSXDQ128,
25982 IX86_BUILTIN_PMOVZXBW128,
25983 IX86_BUILTIN_PMOVZXBD128,
25984 IX86_BUILTIN_PMOVZXBQ128,
25985 IX86_BUILTIN_PMOVZXWD128,
25986 IX86_BUILTIN_PMOVZXWQ128,
25987 IX86_BUILTIN_PMOVZXDQ128,
25989 IX86_BUILTIN_PMULDQ128,
25990 IX86_BUILTIN_PMULLD128,
25992 IX86_BUILTIN_ROUNDSD,
25993 IX86_BUILTIN_ROUNDSS,
25995 IX86_BUILTIN_ROUNDPD,
25996 IX86_BUILTIN_ROUNDPS,
25998 IX86_BUILTIN_FLOORPD,
25999 IX86_BUILTIN_CEILPD,
26000 IX86_BUILTIN_TRUNCPD,
26001 IX86_BUILTIN_RINTPD,
26002 IX86_BUILTIN_ROUNDPD_AZ,
26004 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26005 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26006 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26008 IX86_BUILTIN_FLOORPS,
26009 IX86_BUILTIN_CEILPS,
26010 IX86_BUILTIN_TRUNCPS,
26011 IX86_BUILTIN_RINTPS,
26012 IX86_BUILTIN_ROUNDPS_AZ,
26014 IX86_BUILTIN_FLOORPS_SFIX,
26015 IX86_BUILTIN_CEILPS_SFIX,
26016 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26018 IX86_BUILTIN_PTESTZ,
26019 IX86_BUILTIN_PTESTC,
26020 IX86_BUILTIN_PTESTNZC,
26022 IX86_BUILTIN_VEC_INIT_V2SI,
26023 IX86_BUILTIN_VEC_INIT_V4HI,
26024 IX86_BUILTIN_VEC_INIT_V8QI,
26025 IX86_BUILTIN_VEC_EXT_V2DF,
26026 IX86_BUILTIN_VEC_EXT_V2DI,
26027 IX86_BUILTIN_VEC_EXT_V4SF,
26028 IX86_BUILTIN_VEC_EXT_V4SI,
26029 IX86_BUILTIN_VEC_EXT_V8HI,
26030 IX86_BUILTIN_VEC_EXT_V2SI,
26031 IX86_BUILTIN_VEC_EXT_V4HI,
26032 IX86_BUILTIN_VEC_EXT_V16QI,
26033 IX86_BUILTIN_VEC_SET_V2DI,
26034 IX86_BUILTIN_VEC_SET_V4SF,
26035 IX86_BUILTIN_VEC_SET_V4SI,
26036 IX86_BUILTIN_VEC_SET_V8HI,
26037 IX86_BUILTIN_VEC_SET_V4HI,
26038 IX86_BUILTIN_VEC_SET_V16QI,
26040 IX86_BUILTIN_VEC_PACK_SFIX,
26041 IX86_BUILTIN_VEC_PACK_SFIX256,
26043 /* SSE4.2. */
26044 IX86_BUILTIN_CRC32QI,
26045 IX86_BUILTIN_CRC32HI,
26046 IX86_BUILTIN_CRC32SI,
26047 IX86_BUILTIN_CRC32DI,
26049 IX86_BUILTIN_PCMPESTRI128,
26050 IX86_BUILTIN_PCMPESTRM128,
26051 IX86_BUILTIN_PCMPESTRA128,
26052 IX86_BUILTIN_PCMPESTRC128,
26053 IX86_BUILTIN_PCMPESTRO128,
26054 IX86_BUILTIN_PCMPESTRS128,
26055 IX86_BUILTIN_PCMPESTRZ128,
26056 IX86_BUILTIN_PCMPISTRI128,
26057 IX86_BUILTIN_PCMPISTRM128,
26058 IX86_BUILTIN_PCMPISTRA128,
26059 IX86_BUILTIN_PCMPISTRC128,
26060 IX86_BUILTIN_PCMPISTRO128,
26061 IX86_BUILTIN_PCMPISTRS128,
26062 IX86_BUILTIN_PCMPISTRZ128,
26064 IX86_BUILTIN_PCMPGTQ,
26066 /* AES instructions */
26067 IX86_BUILTIN_AESENC128,
26068 IX86_BUILTIN_AESENCLAST128,
26069 IX86_BUILTIN_AESDEC128,
26070 IX86_BUILTIN_AESDECLAST128,
26071 IX86_BUILTIN_AESIMC128,
26072 IX86_BUILTIN_AESKEYGENASSIST128,
26074 /* PCLMUL instruction */
26075 IX86_BUILTIN_PCLMULQDQ128,
26077 /* AVX */
26078 IX86_BUILTIN_ADDPD256,
26079 IX86_BUILTIN_ADDPS256,
26080 IX86_BUILTIN_ADDSUBPD256,
26081 IX86_BUILTIN_ADDSUBPS256,
26082 IX86_BUILTIN_ANDPD256,
26083 IX86_BUILTIN_ANDPS256,
26084 IX86_BUILTIN_ANDNPD256,
26085 IX86_BUILTIN_ANDNPS256,
26086 IX86_BUILTIN_BLENDPD256,
26087 IX86_BUILTIN_BLENDPS256,
26088 IX86_BUILTIN_BLENDVPD256,
26089 IX86_BUILTIN_BLENDVPS256,
26090 IX86_BUILTIN_DIVPD256,
26091 IX86_BUILTIN_DIVPS256,
26092 IX86_BUILTIN_DPPS256,
26093 IX86_BUILTIN_HADDPD256,
26094 IX86_BUILTIN_HADDPS256,
26095 IX86_BUILTIN_HSUBPD256,
26096 IX86_BUILTIN_HSUBPS256,
26097 IX86_BUILTIN_MAXPD256,
26098 IX86_BUILTIN_MAXPS256,
26099 IX86_BUILTIN_MINPD256,
26100 IX86_BUILTIN_MINPS256,
26101 IX86_BUILTIN_MULPD256,
26102 IX86_BUILTIN_MULPS256,
26103 IX86_BUILTIN_ORPD256,
26104 IX86_BUILTIN_ORPS256,
26105 IX86_BUILTIN_SHUFPD256,
26106 IX86_BUILTIN_SHUFPS256,
26107 IX86_BUILTIN_SUBPD256,
26108 IX86_BUILTIN_SUBPS256,
26109 IX86_BUILTIN_XORPD256,
26110 IX86_BUILTIN_XORPS256,
26111 IX86_BUILTIN_CMPSD,
26112 IX86_BUILTIN_CMPSS,
26113 IX86_BUILTIN_CMPPD,
26114 IX86_BUILTIN_CMPPS,
26115 IX86_BUILTIN_CMPPD256,
26116 IX86_BUILTIN_CMPPS256,
26117 IX86_BUILTIN_CVTDQ2PD256,
26118 IX86_BUILTIN_CVTDQ2PS256,
26119 IX86_BUILTIN_CVTPD2PS256,
26120 IX86_BUILTIN_CVTPS2DQ256,
26121 IX86_BUILTIN_CVTPS2PD256,
26122 IX86_BUILTIN_CVTTPD2DQ256,
26123 IX86_BUILTIN_CVTPD2DQ256,
26124 IX86_BUILTIN_CVTTPS2DQ256,
26125 IX86_BUILTIN_EXTRACTF128PD256,
26126 IX86_BUILTIN_EXTRACTF128PS256,
26127 IX86_BUILTIN_EXTRACTF128SI256,
26128 IX86_BUILTIN_VZEROALL,
26129 IX86_BUILTIN_VZEROUPPER,
26130 IX86_BUILTIN_VPERMILVARPD,
26131 IX86_BUILTIN_VPERMILVARPS,
26132 IX86_BUILTIN_VPERMILVARPD256,
26133 IX86_BUILTIN_VPERMILVARPS256,
26134 IX86_BUILTIN_VPERMILPD,
26135 IX86_BUILTIN_VPERMILPS,
26136 IX86_BUILTIN_VPERMILPD256,
26137 IX86_BUILTIN_VPERMILPS256,
26138 IX86_BUILTIN_VPERMIL2PD,
26139 IX86_BUILTIN_VPERMIL2PS,
26140 IX86_BUILTIN_VPERMIL2PD256,
26141 IX86_BUILTIN_VPERMIL2PS256,
26142 IX86_BUILTIN_VPERM2F128PD256,
26143 IX86_BUILTIN_VPERM2F128PS256,
26144 IX86_BUILTIN_VPERM2F128SI256,
26145 IX86_BUILTIN_VBROADCASTSS,
26146 IX86_BUILTIN_VBROADCASTSD256,
26147 IX86_BUILTIN_VBROADCASTSS256,
26148 IX86_BUILTIN_VBROADCASTPD256,
26149 IX86_BUILTIN_VBROADCASTPS256,
26150 IX86_BUILTIN_VINSERTF128PD256,
26151 IX86_BUILTIN_VINSERTF128PS256,
26152 IX86_BUILTIN_VINSERTF128SI256,
26153 IX86_BUILTIN_LOADUPD256,
26154 IX86_BUILTIN_LOADUPS256,
26155 IX86_BUILTIN_STOREUPD256,
26156 IX86_BUILTIN_STOREUPS256,
26157 IX86_BUILTIN_LDDQU256,
26158 IX86_BUILTIN_MOVNTDQ256,
26159 IX86_BUILTIN_MOVNTPD256,
26160 IX86_BUILTIN_MOVNTPS256,
26161 IX86_BUILTIN_LOADDQU256,
26162 IX86_BUILTIN_STOREDQU256,
26163 IX86_BUILTIN_MASKLOADPD,
26164 IX86_BUILTIN_MASKLOADPS,
26165 IX86_BUILTIN_MASKSTOREPD,
26166 IX86_BUILTIN_MASKSTOREPS,
26167 IX86_BUILTIN_MASKLOADPD256,
26168 IX86_BUILTIN_MASKLOADPS256,
26169 IX86_BUILTIN_MASKSTOREPD256,
26170 IX86_BUILTIN_MASKSTOREPS256,
26171 IX86_BUILTIN_MOVSHDUP256,
26172 IX86_BUILTIN_MOVSLDUP256,
26173 IX86_BUILTIN_MOVDDUP256,
26175 IX86_BUILTIN_SQRTPD256,
26176 IX86_BUILTIN_SQRTPS256,
26177 IX86_BUILTIN_SQRTPS_NR256,
26178 IX86_BUILTIN_RSQRTPS256,
26179 IX86_BUILTIN_RSQRTPS_NR256,
26181 IX86_BUILTIN_RCPPS256,
26183 IX86_BUILTIN_ROUNDPD256,
26184 IX86_BUILTIN_ROUNDPS256,
26186 IX86_BUILTIN_FLOORPD256,
26187 IX86_BUILTIN_CEILPD256,
26188 IX86_BUILTIN_TRUNCPD256,
26189 IX86_BUILTIN_RINTPD256,
26190 IX86_BUILTIN_ROUNDPD_AZ256,
26192 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26193 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26194 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26196 IX86_BUILTIN_FLOORPS256,
26197 IX86_BUILTIN_CEILPS256,
26198 IX86_BUILTIN_TRUNCPS256,
26199 IX86_BUILTIN_RINTPS256,
26200 IX86_BUILTIN_ROUNDPS_AZ256,
26202 IX86_BUILTIN_FLOORPS_SFIX256,
26203 IX86_BUILTIN_CEILPS_SFIX256,
26204 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26206 IX86_BUILTIN_UNPCKHPD256,
26207 IX86_BUILTIN_UNPCKLPD256,
26208 IX86_BUILTIN_UNPCKHPS256,
26209 IX86_BUILTIN_UNPCKLPS256,
26211 IX86_BUILTIN_SI256_SI,
26212 IX86_BUILTIN_PS256_PS,
26213 IX86_BUILTIN_PD256_PD,
26214 IX86_BUILTIN_SI_SI256,
26215 IX86_BUILTIN_PS_PS256,
26216 IX86_BUILTIN_PD_PD256,
26218 IX86_BUILTIN_VTESTZPD,
26219 IX86_BUILTIN_VTESTCPD,
26220 IX86_BUILTIN_VTESTNZCPD,
26221 IX86_BUILTIN_VTESTZPS,
26222 IX86_BUILTIN_VTESTCPS,
26223 IX86_BUILTIN_VTESTNZCPS,
26224 IX86_BUILTIN_VTESTZPD256,
26225 IX86_BUILTIN_VTESTCPD256,
26226 IX86_BUILTIN_VTESTNZCPD256,
26227 IX86_BUILTIN_VTESTZPS256,
26228 IX86_BUILTIN_VTESTCPS256,
26229 IX86_BUILTIN_VTESTNZCPS256,
26230 IX86_BUILTIN_PTESTZ256,
26231 IX86_BUILTIN_PTESTC256,
26232 IX86_BUILTIN_PTESTNZC256,
26234 IX86_BUILTIN_MOVMSKPD256,
26235 IX86_BUILTIN_MOVMSKPS256,
26237 /* AVX2 */
26238 IX86_BUILTIN_MPSADBW256,
26239 IX86_BUILTIN_PABSB256,
26240 IX86_BUILTIN_PABSW256,
26241 IX86_BUILTIN_PABSD256,
26242 IX86_BUILTIN_PACKSSDW256,
26243 IX86_BUILTIN_PACKSSWB256,
26244 IX86_BUILTIN_PACKUSDW256,
26245 IX86_BUILTIN_PACKUSWB256,
26246 IX86_BUILTIN_PADDB256,
26247 IX86_BUILTIN_PADDW256,
26248 IX86_BUILTIN_PADDD256,
26249 IX86_BUILTIN_PADDQ256,
26250 IX86_BUILTIN_PADDSB256,
26251 IX86_BUILTIN_PADDSW256,
26252 IX86_BUILTIN_PADDUSB256,
26253 IX86_BUILTIN_PADDUSW256,
26254 IX86_BUILTIN_PALIGNR256,
26255 IX86_BUILTIN_AND256I,
26256 IX86_BUILTIN_ANDNOT256I,
26257 IX86_BUILTIN_PAVGB256,
26258 IX86_BUILTIN_PAVGW256,
26259 IX86_BUILTIN_PBLENDVB256,
26260 IX86_BUILTIN_PBLENDVW256,
26261 IX86_BUILTIN_PCMPEQB256,
26262 IX86_BUILTIN_PCMPEQW256,
26263 IX86_BUILTIN_PCMPEQD256,
26264 IX86_BUILTIN_PCMPEQQ256,
26265 IX86_BUILTIN_PCMPGTB256,
26266 IX86_BUILTIN_PCMPGTW256,
26267 IX86_BUILTIN_PCMPGTD256,
26268 IX86_BUILTIN_PCMPGTQ256,
26269 IX86_BUILTIN_PHADDW256,
26270 IX86_BUILTIN_PHADDD256,
26271 IX86_BUILTIN_PHADDSW256,
26272 IX86_BUILTIN_PHSUBW256,
26273 IX86_BUILTIN_PHSUBD256,
26274 IX86_BUILTIN_PHSUBSW256,
26275 IX86_BUILTIN_PMADDUBSW256,
26276 IX86_BUILTIN_PMADDWD256,
26277 IX86_BUILTIN_PMAXSB256,
26278 IX86_BUILTIN_PMAXSW256,
26279 IX86_BUILTIN_PMAXSD256,
26280 IX86_BUILTIN_PMAXUB256,
26281 IX86_BUILTIN_PMAXUW256,
26282 IX86_BUILTIN_PMAXUD256,
26283 IX86_BUILTIN_PMINSB256,
26284 IX86_BUILTIN_PMINSW256,
26285 IX86_BUILTIN_PMINSD256,
26286 IX86_BUILTIN_PMINUB256,
26287 IX86_BUILTIN_PMINUW256,
26288 IX86_BUILTIN_PMINUD256,
26289 IX86_BUILTIN_PMOVMSKB256,
26290 IX86_BUILTIN_PMOVSXBW256,
26291 IX86_BUILTIN_PMOVSXBD256,
26292 IX86_BUILTIN_PMOVSXBQ256,
26293 IX86_BUILTIN_PMOVSXWD256,
26294 IX86_BUILTIN_PMOVSXWQ256,
26295 IX86_BUILTIN_PMOVSXDQ256,
26296 IX86_BUILTIN_PMOVZXBW256,
26297 IX86_BUILTIN_PMOVZXBD256,
26298 IX86_BUILTIN_PMOVZXBQ256,
26299 IX86_BUILTIN_PMOVZXWD256,
26300 IX86_BUILTIN_PMOVZXWQ256,
26301 IX86_BUILTIN_PMOVZXDQ256,
26302 IX86_BUILTIN_PMULDQ256,
26303 IX86_BUILTIN_PMULHRSW256,
26304 IX86_BUILTIN_PMULHUW256,
26305 IX86_BUILTIN_PMULHW256,
26306 IX86_BUILTIN_PMULLW256,
26307 IX86_BUILTIN_PMULLD256,
26308 IX86_BUILTIN_PMULUDQ256,
26309 IX86_BUILTIN_POR256,
26310 IX86_BUILTIN_PSADBW256,
26311 IX86_BUILTIN_PSHUFB256,
26312 IX86_BUILTIN_PSHUFD256,
26313 IX86_BUILTIN_PSHUFHW256,
26314 IX86_BUILTIN_PSHUFLW256,
26315 IX86_BUILTIN_PSIGNB256,
26316 IX86_BUILTIN_PSIGNW256,
26317 IX86_BUILTIN_PSIGND256,
26318 IX86_BUILTIN_PSLLDQI256,
26319 IX86_BUILTIN_PSLLWI256,
26320 IX86_BUILTIN_PSLLW256,
26321 IX86_BUILTIN_PSLLDI256,
26322 IX86_BUILTIN_PSLLD256,
26323 IX86_BUILTIN_PSLLQI256,
26324 IX86_BUILTIN_PSLLQ256,
26325 IX86_BUILTIN_PSRAWI256,
26326 IX86_BUILTIN_PSRAW256,
26327 IX86_BUILTIN_PSRADI256,
26328 IX86_BUILTIN_PSRAD256,
26329 IX86_BUILTIN_PSRLDQI256,
26330 IX86_BUILTIN_PSRLWI256,
26331 IX86_BUILTIN_PSRLW256,
26332 IX86_BUILTIN_PSRLDI256,
26333 IX86_BUILTIN_PSRLD256,
26334 IX86_BUILTIN_PSRLQI256,
26335 IX86_BUILTIN_PSRLQ256,
26336 IX86_BUILTIN_PSUBB256,
26337 IX86_BUILTIN_PSUBW256,
26338 IX86_BUILTIN_PSUBD256,
26339 IX86_BUILTIN_PSUBQ256,
26340 IX86_BUILTIN_PSUBSB256,
26341 IX86_BUILTIN_PSUBSW256,
26342 IX86_BUILTIN_PSUBUSB256,
26343 IX86_BUILTIN_PSUBUSW256,
26344 IX86_BUILTIN_PUNPCKHBW256,
26345 IX86_BUILTIN_PUNPCKHWD256,
26346 IX86_BUILTIN_PUNPCKHDQ256,
26347 IX86_BUILTIN_PUNPCKHQDQ256,
26348 IX86_BUILTIN_PUNPCKLBW256,
26349 IX86_BUILTIN_PUNPCKLWD256,
26350 IX86_BUILTIN_PUNPCKLDQ256,
26351 IX86_BUILTIN_PUNPCKLQDQ256,
26352 IX86_BUILTIN_PXOR256,
26353 IX86_BUILTIN_MOVNTDQA256,
26354 IX86_BUILTIN_VBROADCASTSS_PS,
26355 IX86_BUILTIN_VBROADCASTSS_PS256,
26356 IX86_BUILTIN_VBROADCASTSD_PD256,
26357 IX86_BUILTIN_VBROADCASTSI256,
26358 IX86_BUILTIN_PBLENDD256,
26359 IX86_BUILTIN_PBLENDD128,
26360 IX86_BUILTIN_PBROADCASTB256,
26361 IX86_BUILTIN_PBROADCASTW256,
26362 IX86_BUILTIN_PBROADCASTD256,
26363 IX86_BUILTIN_PBROADCASTQ256,
26364 IX86_BUILTIN_PBROADCASTB128,
26365 IX86_BUILTIN_PBROADCASTW128,
26366 IX86_BUILTIN_PBROADCASTD128,
26367 IX86_BUILTIN_PBROADCASTQ128,
26368 IX86_BUILTIN_VPERMVARSI256,
26369 IX86_BUILTIN_VPERMDF256,
26370 IX86_BUILTIN_VPERMVARSF256,
26371 IX86_BUILTIN_VPERMDI256,
26372 IX86_BUILTIN_VPERMTI256,
26373 IX86_BUILTIN_VEXTRACT128I256,
26374 IX86_BUILTIN_VINSERT128I256,
26375 IX86_BUILTIN_MASKLOADD,
26376 IX86_BUILTIN_MASKLOADQ,
26377 IX86_BUILTIN_MASKLOADD256,
26378 IX86_BUILTIN_MASKLOADQ256,
26379 IX86_BUILTIN_MASKSTORED,
26380 IX86_BUILTIN_MASKSTOREQ,
26381 IX86_BUILTIN_MASKSTORED256,
26382 IX86_BUILTIN_MASKSTOREQ256,
26383 IX86_BUILTIN_PSLLVV4DI,
26384 IX86_BUILTIN_PSLLVV2DI,
26385 IX86_BUILTIN_PSLLVV8SI,
26386 IX86_BUILTIN_PSLLVV4SI,
26387 IX86_BUILTIN_PSRAVV8SI,
26388 IX86_BUILTIN_PSRAVV4SI,
26389 IX86_BUILTIN_PSRLVV4DI,
26390 IX86_BUILTIN_PSRLVV2DI,
26391 IX86_BUILTIN_PSRLVV8SI,
26392 IX86_BUILTIN_PSRLVV4SI,
26394 IX86_BUILTIN_GATHERSIV2DF,
26395 IX86_BUILTIN_GATHERSIV4DF,
26396 IX86_BUILTIN_GATHERDIV2DF,
26397 IX86_BUILTIN_GATHERDIV4DF,
26398 IX86_BUILTIN_GATHERSIV4SF,
26399 IX86_BUILTIN_GATHERSIV8SF,
26400 IX86_BUILTIN_GATHERDIV4SF,
26401 IX86_BUILTIN_GATHERDIV8SF,
26402 IX86_BUILTIN_GATHERSIV2DI,
26403 IX86_BUILTIN_GATHERSIV4DI,
26404 IX86_BUILTIN_GATHERDIV2DI,
26405 IX86_BUILTIN_GATHERDIV4DI,
26406 IX86_BUILTIN_GATHERSIV4SI,
26407 IX86_BUILTIN_GATHERSIV8SI,
26408 IX86_BUILTIN_GATHERDIV4SI,
26409 IX86_BUILTIN_GATHERDIV8SI,
26411 /* Alternate 4-element gather for the vectorizer, where
26412 all operands are 32 bytes wide. */
26413 IX86_BUILTIN_GATHERALTSIV4DF,
26414 IX86_BUILTIN_GATHERALTDIV8SF,
26415 IX86_BUILTIN_GATHERALTSIV4DI,
26416 IX86_BUILTIN_GATHERALTDIV8SI,
26418 /* TFmode support builtins. */
26419 IX86_BUILTIN_INFQ,
26420 IX86_BUILTIN_HUGE_VALQ,
26421 IX86_BUILTIN_FABSQ,
26422 IX86_BUILTIN_COPYSIGNQ,
26424 /* Vectorizer support builtins. */
26425 IX86_BUILTIN_CPYSGNPS,
26426 IX86_BUILTIN_CPYSGNPD,
26427 IX86_BUILTIN_CPYSGNPS256,
26428 IX86_BUILTIN_CPYSGNPD256,
26430 /* FMA4 instructions. */
26431 IX86_BUILTIN_VFMADDSS,
26432 IX86_BUILTIN_VFMADDSD,
26433 IX86_BUILTIN_VFMADDPS,
26434 IX86_BUILTIN_VFMADDPD,
26435 IX86_BUILTIN_VFMADDPS256,
26436 IX86_BUILTIN_VFMADDPD256,
26437 IX86_BUILTIN_VFMADDSUBPS,
26438 IX86_BUILTIN_VFMADDSUBPD,
26439 IX86_BUILTIN_VFMADDSUBPS256,
26440 IX86_BUILTIN_VFMADDSUBPD256,
26442 /* FMA3 instructions. */
26443 IX86_BUILTIN_VFMADDSS3,
26444 IX86_BUILTIN_VFMADDSD3,
26446 /* XOP instructions. */
26447 IX86_BUILTIN_VPCMOV,
26448 IX86_BUILTIN_VPCMOV_V2DI,
26449 IX86_BUILTIN_VPCMOV_V4SI,
26450 IX86_BUILTIN_VPCMOV_V8HI,
26451 IX86_BUILTIN_VPCMOV_V16QI,
26452 IX86_BUILTIN_VPCMOV_V4SF,
26453 IX86_BUILTIN_VPCMOV_V2DF,
26454 IX86_BUILTIN_VPCMOV256,
26455 IX86_BUILTIN_VPCMOV_V4DI256,
26456 IX86_BUILTIN_VPCMOV_V8SI256,
26457 IX86_BUILTIN_VPCMOV_V16HI256,
26458 IX86_BUILTIN_VPCMOV_V32QI256,
26459 IX86_BUILTIN_VPCMOV_V8SF256,
26460 IX86_BUILTIN_VPCMOV_V4DF256,
26462 IX86_BUILTIN_VPPERM,
26464 IX86_BUILTIN_VPMACSSWW,
26465 IX86_BUILTIN_VPMACSWW,
26466 IX86_BUILTIN_VPMACSSWD,
26467 IX86_BUILTIN_VPMACSWD,
26468 IX86_BUILTIN_VPMACSSDD,
26469 IX86_BUILTIN_VPMACSDD,
26470 IX86_BUILTIN_VPMACSSDQL,
26471 IX86_BUILTIN_VPMACSSDQH,
26472 IX86_BUILTIN_VPMACSDQL,
26473 IX86_BUILTIN_VPMACSDQH,
26474 IX86_BUILTIN_VPMADCSSWD,
26475 IX86_BUILTIN_VPMADCSWD,
26477 IX86_BUILTIN_VPHADDBW,
26478 IX86_BUILTIN_VPHADDBD,
26479 IX86_BUILTIN_VPHADDBQ,
26480 IX86_BUILTIN_VPHADDWD,
26481 IX86_BUILTIN_VPHADDWQ,
26482 IX86_BUILTIN_VPHADDDQ,
26483 IX86_BUILTIN_VPHADDUBW,
26484 IX86_BUILTIN_VPHADDUBD,
26485 IX86_BUILTIN_VPHADDUBQ,
26486 IX86_BUILTIN_VPHADDUWD,
26487 IX86_BUILTIN_VPHADDUWQ,
26488 IX86_BUILTIN_VPHADDUDQ,
26489 IX86_BUILTIN_VPHSUBBW,
26490 IX86_BUILTIN_VPHSUBWD,
26491 IX86_BUILTIN_VPHSUBDQ,
26493 IX86_BUILTIN_VPROTB,
26494 IX86_BUILTIN_VPROTW,
26495 IX86_BUILTIN_VPROTD,
26496 IX86_BUILTIN_VPROTQ,
26497 IX86_BUILTIN_VPROTB_IMM,
26498 IX86_BUILTIN_VPROTW_IMM,
26499 IX86_BUILTIN_VPROTD_IMM,
26500 IX86_BUILTIN_VPROTQ_IMM,
26502 IX86_BUILTIN_VPSHLB,
26503 IX86_BUILTIN_VPSHLW,
26504 IX86_BUILTIN_VPSHLD,
26505 IX86_BUILTIN_VPSHLQ,
26506 IX86_BUILTIN_VPSHAB,
26507 IX86_BUILTIN_VPSHAW,
26508 IX86_BUILTIN_VPSHAD,
26509 IX86_BUILTIN_VPSHAQ,
26511 IX86_BUILTIN_VFRCZSS,
26512 IX86_BUILTIN_VFRCZSD,
26513 IX86_BUILTIN_VFRCZPS,
26514 IX86_BUILTIN_VFRCZPD,
26515 IX86_BUILTIN_VFRCZPS256,
26516 IX86_BUILTIN_VFRCZPD256,
26518 IX86_BUILTIN_VPCOMEQUB,
26519 IX86_BUILTIN_VPCOMNEUB,
26520 IX86_BUILTIN_VPCOMLTUB,
26521 IX86_BUILTIN_VPCOMLEUB,
26522 IX86_BUILTIN_VPCOMGTUB,
26523 IX86_BUILTIN_VPCOMGEUB,
26524 IX86_BUILTIN_VPCOMFALSEUB,
26525 IX86_BUILTIN_VPCOMTRUEUB,
26527 IX86_BUILTIN_VPCOMEQUW,
26528 IX86_BUILTIN_VPCOMNEUW,
26529 IX86_BUILTIN_VPCOMLTUW,
26530 IX86_BUILTIN_VPCOMLEUW,
26531 IX86_BUILTIN_VPCOMGTUW,
26532 IX86_BUILTIN_VPCOMGEUW,
26533 IX86_BUILTIN_VPCOMFALSEUW,
26534 IX86_BUILTIN_VPCOMTRUEUW,
26536 IX86_BUILTIN_VPCOMEQUD,
26537 IX86_BUILTIN_VPCOMNEUD,
26538 IX86_BUILTIN_VPCOMLTUD,
26539 IX86_BUILTIN_VPCOMLEUD,
26540 IX86_BUILTIN_VPCOMGTUD,
26541 IX86_BUILTIN_VPCOMGEUD,
26542 IX86_BUILTIN_VPCOMFALSEUD,
26543 IX86_BUILTIN_VPCOMTRUEUD,
26545 IX86_BUILTIN_VPCOMEQUQ,
26546 IX86_BUILTIN_VPCOMNEUQ,
26547 IX86_BUILTIN_VPCOMLTUQ,
26548 IX86_BUILTIN_VPCOMLEUQ,
26549 IX86_BUILTIN_VPCOMGTUQ,
26550 IX86_BUILTIN_VPCOMGEUQ,
26551 IX86_BUILTIN_VPCOMFALSEUQ,
26552 IX86_BUILTIN_VPCOMTRUEUQ,
26554 IX86_BUILTIN_VPCOMEQB,
26555 IX86_BUILTIN_VPCOMNEB,
26556 IX86_BUILTIN_VPCOMLTB,
26557 IX86_BUILTIN_VPCOMLEB,
26558 IX86_BUILTIN_VPCOMGTB,
26559 IX86_BUILTIN_VPCOMGEB,
26560 IX86_BUILTIN_VPCOMFALSEB,
26561 IX86_BUILTIN_VPCOMTRUEB,
26563 IX86_BUILTIN_VPCOMEQW,
26564 IX86_BUILTIN_VPCOMNEW,
26565 IX86_BUILTIN_VPCOMLTW,
26566 IX86_BUILTIN_VPCOMLEW,
26567 IX86_BUILTIN_VPCOMGTW,
26568 IX86_BUILTIN_VPCOMGEW,
26569 IX86_BUILTIN_VPCOMFALSEW,
26570 IX86_BUILTIN_VPCOMTRUEW,
26572 IX86_BUILTIN_VPCOMEQD,
26573 IX86_BUILTIN_VPCOMNED,
26574 IX86_BUILTIN_VPCOMLTD,
26575 IX86_BUILTIN_VPCOMLED,
26576 IX86_BUILTIN_VPCOMGTD,
26577 IX86_BUILTIN_VPCOMGED,
26578 IX86_BUILTIN_VPCOMFALSED,
26579 IX86_BUILTIN_VPCOMTRUED,
26581 IX86_BUILTIN_VPCOMEQQ,
26582 IX86_BUILTIN_VPCOMNEQ,
26583 IX86_BUILTIN_VPCOMLTQ,
26584 IX86_BUILTIN_VPCOMLEQ,
26585 IX86_BUILTIN_VPCOMGTQ,
26586 IX86_BUILTIN_VPCOMGEQ,
26587 IX86_BUILTIN_VPCOMFALSEQ,
26588 IX86_BUILTIN_VPCOMTRUEQ,
26590 /* LWP instructions. */
26591 IX86_BUILTIN_LLWPCB,
26592 IX86_BUILTIN_SLWPCB,
26593 IX86_BUILTIN_LWPVAL32,
26594 IX86_BUILTIN_LWPVAL64,
26595 IX86_BUILTIN_LWPINS32,
26596 IX86_BUILTIN_LWPINS64,
26598 IX86_BUILTIN_CLZS,
26600 /* RTM */
26601 IX86_BUILTIN_XBEGIN,
26602 IX86_BUILTIN_XEND,
26603 IX86_BUILTIN_XABORT,
26604 IX86_BUILTIN_XTEST,
26606 /* BMI instructions. */
26607 IX86_BUILTIN_BEXTR32,
26608 IX86_BUILTIN_BEXTR64,
26609 IX86_BUILTIN_CTZS,
26611 /* TBM instructions. */
26612 IX86_BUILTIN_BEXTRI32,
26613 IX86_BUILTIN_BEXTRI64,
26615 /* BMI2 instructions. */
26616 IX86_BUILTIN_BZHI32,
26617 IX86_BUILTIN_BZHI64,
26618 IX86_BUILTIN_PDEP32,
26619 IX86_BUILTIN_PDEP64,
26620 IX86_BUILTIN_PEXT32,
26621 IX86_BUILTIN_PEXT64,
26623 /* ADX instructions. */
26624 IX86_BUILTIN_ADDCARRYX32,
26625 IX86_BUILTIN_ADDCARRYX64,
26627 /* FSGSBASE instructions. */
26628 IX86_BUILTIN_RDFSBASE32,
26629 IX86_BUILTIN_RDFSBASE64,
26630 IX86_BUILTIN_RDGSBASE32,
26631 IX86_BUILTIN_RDGSBASE64,
26632 IX86_BUILTIN_WRFSBASE32,
26633 IX86_BUILTIN_WRFSBASE64,
26634 IX86_BUILTIN_WRGSBASE32,
26635 IX86_BUILTIN_WRGSBASE64,
26637 /* RDRND instructions. */
26638 IX86_BUILTIN_RDRAND16_STEP,
26639 IX86_BUILTIN_RDRAND32_STEP,
26640 IX86_BUILTIN_RDRAND64_STEP,
26642 /* RDSEED instructions. */
26643 IX86_BUILTIN_RDSEED16_STEP,
26644 IX86_BUILTIN_RDSEED32_STEP,
26645 IX86_BUILTIN_RDSEED64_STEP,
26647 /* F16C instructions. */
26648 IX86_BUILTIN_CVTPH2PS,
26649 IX86_BUILTIN_CVTPH2PS256,
26650 IX86_BUILTIN_CVTPS2PH,
26651 IX86_BUILTIN_CVTPS2PH256,
26653 /* CFString built-in for darwin */
26654 IX86_BUILTIN_CFSTRING,
26656 /* Builtins to get CPU type and supported features. */
26657 IX86_BUILTIN_CPU_INIT,
26658 IX86_BUILTIN_CPU_IS,
26659 IX86_BUILTIN_CPU_SUPPORTS,
26661 IX86_BUILTIN_MAX
26664 /* Table for the ix86 builtin decls. */
26665 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26667 /* Table of all of the builtin functions that are possible with different ISAs
26668 but are waiting to be built until a function is declared to use that
26669 ISA. */
26670 struct builtin_isa {
26671 const char *name; /* function name */
26672 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26673 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26674 bool const_p; /* true if the declaration is constant */
26675 bool set_and_not_built_p;
26678 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26681 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26682 of isa_flags this builtin requires in the ix86_builtins_isa array. Store the
26683 function decl in the ix86_builtins array. Return the function decl, or
26684 NULL_TREE if the builtin was not added.
26686 If the front end has a special hook for builtin functions, delay adding
26687 builtin functions that aren't in the current ISA until the ISA is changed
26688 with function-specific optimization. Doing so can save about 300K for the
26689 default compiler. When the builtin is expanded, check at that time whether
26690 it is valid.
26692 If the front end doesn't have a special hook, record all builtins, even if
26693 they aren't in the current ISA, in case the user uses function-specific
26694 options for a different ISA; that way we don't get scope errors if a
26695 builtin is added in the middle of a function scope. */
26697 static inline tree
26698 def_builtin (HOST_WIDE_INT mask, const char *name,
26699 enum ix86_builtin_func_type tcode,
26700 enum ix86_builtins code)
26702 tree decl = NULL_TREE;
26704 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26706 ix86_builtins_isa[(int) code].isa = mask;
26708 mask &= ~OPTION_MASK_ISA_64BIT;
26709 if (mask == 0
26710 || (mask & ix86_isa_flags) != 0
26711 || (lang_hooks.builtin_function
26712 == lang_hooks.builtin_function_ext_scope))
26715 tree type = ix86_get_builtin_func_type (tcode);
26716 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26717 NULL, NULL_TREE);
26718 ix86_builtins[(int) code] = decl;
26719 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26721 else
26723 ix86_builtins[(int) code] = NULL_TREE;
26724 ix86_builtins_isa[(int) code].tcode = tcode;
26725 ix86_builtins_isa[(int) code].name = name;
26726 ix86_builtins_isa[(int) code].const_p = false;
26727 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26731 return decl;
26734 /* Like def_builtin, but also marks the function decl "const". */
26736 static inline tree
26737 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26738 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26740 tree decl = def_builtin (mask, name, tcode, code);
26741 if (decl)
26742 TREE_READONLY (decl) = 1;
26743 else
26744 ix86_builtins_isa[(int) code].const_p = true;
26746 return decl;
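/* Typical use (an illustrative sketch; the SSE/AVX builtins below are in
   fact registered by looping over the bdesc_* tables): a constant SSE
   builtin such as __builtin_ia32_addps would be registered as

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
                        V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   where V4SF_FTYPE_V4SF_V4SF is one of the function-type codes generated
   from i386-builtin-types.def.  */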
26749 /* Add any new builtin functions for a given ISA that may not have been
26750 declared yet. This saves a bit of space compared to adding all of the
26751 declarations to the tree up front, even when they end up unused. */
26753 static void
26754 ix86_add_new_builtins (HOST_WIDE_INT isa)
26756 int i;
26758 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26760 if ((ix86_builtins_isa[i].isa & isa) != 0
26761 && ix86_builtins_isa[i].set_and_not_built_p)
26763 tree decl, type;
26765 /* Don't define the builtin again. */
26766 ix86_builtins_isa[i].set_and_not_built_p = false;
26768 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26769 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26770 type, i, BUILT_IN_MD, NULL,
26771 NULL_TREE);
26773 ix86_builtins[i] = decl;
26774 if (ix86_builtins_isa[i].const_p)
26775 TREE_READONLY (decl) = 1;
26780 /* Bits for builtin_description.flag. */
26782 /* Set when we don't support the comparison natively, and should
26783 swap_comparison in order to support it. */
26784 #define BUILTIN_DESC_SWAP_OPERANDS 1
26786 struct builtin_description
26788 const HOST_WIDE_INT mask;
26789 const enum insn_code icode;
26790 const char *const name;
26791 const enum ix86_builtins code;
26792 const enum rtx_code comparison;
26793 const int flag;
26796 static const struct builtin_description bdesc_comi[] =
26798 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26799 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26800 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26801 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26802 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26803 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26804 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26805 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26806 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26807 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26808 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26809 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26810 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26811 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26812 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26813 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26815 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26816 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26817 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26818 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26819 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26820 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26821 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26822 };
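/* Editorial note (hedged): the comparison field in bdesc_comi (UNEQ, UNLT,
   GT, ...) is the RTL comparison used when the builtin is expanded; all
   entries share the same comi/ucomi insn and differ only in which EFLAGS
   condition is tested to produce the integer result.  */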
26824 static const struct builtin_description bdesc_pcmpestr[] =
26825 {
26826 /* SSE4.2 */
26827 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26828 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26829 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26830 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26831 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26832 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26833 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26836 static const struct builtin_description bdesc_pcmpistr[] =
26837 {
26838 /* SSE4.2 */
26839 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26840 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26841 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26842 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26843 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26844 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26845 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
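/* Editorial note (hedged): for the pcmpestr/pcmpistr tables the flag field
   carries a CC mode (CCAmode, CCCmode, CCOmode, CCSmode, CCZmode) naming
   the EFLAGS bit the builtin returns, while the plain ...i128/...m128
   variants with flag 0 return the index or mask result directly.  */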
26848 /* Special builtins with variable number of arguments. */
26849 static const struct builtin_description bdesc_special_args[] =
26850 {
26851 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26852 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26853 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26855 /* MMX */
26856 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26858 /* 3DNow! */
26859 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26861 /* FXSR, XSAVE and XSAVEOPT */
26862 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26863 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26864 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26865 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26866 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26868 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26869 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26870 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26871 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26872 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26874 /* SSE */
26875 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26876 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26877 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26879 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26880 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26881 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26882 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26884 /* SSE or 3DNow!A */
26885 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26886 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26888 /* SSE2 */
26889 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26890 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26891 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26892 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26893 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26894 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26895 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26896 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26897 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26898 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26900 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26901 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26903 /* SSE3 */
26904 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26906 /* SSE4.1 */
26907 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26909 /* SSE4A */
26910 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26911 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26913 /* AVX */
26914 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26915 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26917 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26918 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26919 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26920 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26921 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26923 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26924 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26925 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26926 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26927 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26928 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26929 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26931 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26932 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26933 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26935 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26936 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26937 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26938 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26939 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26940 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26941 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26942 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26944 /* AVX2 */
26945 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26946 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26951 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26952 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26953 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26955 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26956 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26957 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26958 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26959 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26960 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26962 /* FSGSBASE */
26963 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26964 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26965 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26966 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26967 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26968 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26969 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26970 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26972 /* RTM */
26973 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26974 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26975 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26976 };
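/* Editorial note (hedged): entries in bdesc_special_args describe builtins
   with side effects or memory operands (fences, non-temporal stores,
   unaligned and masked loads/stores, state save/restore), so they are
   expanded along a different path from the pure arithmetic builtins in
   bdesc_args below, whose entries are const and work only on values.  */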
26978 /* Builtins with variable number of arguments. */
26979 static const struct builtin_description bdesc_args[] =
26980 {
26981 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26982 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26983 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26984 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26985 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26986 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26987 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26989 /* MMX */
26990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26992 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26997 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27018 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27023 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27024 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27026 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27028 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27029 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27030 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27032 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27034 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27035 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27036 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27037 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27038 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27039 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27041 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27042 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27043 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27044 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27045 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27046 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27048 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27049 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27050 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27051 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27053 /* 3DNow! */
27054 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27055 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27056 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27057 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27059 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27060 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27061 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27062 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27063 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27064 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27065 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27066 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27067 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27068 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27069 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27070 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27071 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27072 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27073 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27075 /* 3DNow!A */
27076 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27077 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27078 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27079 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27080 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27081 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27083 /* SSE */
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27088 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27092 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27095 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27131 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27132 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27133 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27136 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27137 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27138 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27139 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27141 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27143 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27144 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27145 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27146 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27147 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27149 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27150 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27151 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
27153 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27155 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27156 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27157 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27159 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27160 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27162 /* SSE MMX or 3Dnow!A */
27163 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27164 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27165 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27167 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27168 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27169 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27170 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27172 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27173 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27175 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27177 /* SSE2 */
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27196 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27197 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27206 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27208 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27212 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27214 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27247 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27255 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27274 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27278 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27289 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27308 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27314 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27322 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27323 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27327 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27329 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27330 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27331 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27332 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27334 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27335 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27336 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
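  /* Note on the shift descriptors above: the *_SI_COUNT prototypes
     (psllwi128, pslldi128, psrlwi128, psrawi128, ...) take the shift count
     as a scalar integer argument, while the *_V8HI_COUNT, *_V4SI_COUNT and
     *_V2DI_COUNT prototypes (psllw128, pslld128, ...) take the count from
     the low quadword of a vector operand, matching the two hardware forms
     of the shift instructions.  */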
27339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27345 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
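  /* A reminder of the field layout used throughout this table (see
     struct builtin_description earlier in this file): ISA option mask,
     insn code of the expander pattern, builtin name, IX86_BUILTIN_* code,
     an rtx comparison code or rounding constant (UNKNOWN when unused),
     and the function prototype enum cast to int.  */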
27347 /* SSE2 MMX */
27348 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27349 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27351 /* SSE3 */
27352 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27353 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27355 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27356 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27357 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27358 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27359 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27360 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27362 /* SSSE3 */
27363 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27364 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27365 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27366 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27367 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27368 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27370 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27371 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27372 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27373 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27374 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27375 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27376 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27377 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27378 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27379 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27380 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27381 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27382 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27383 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27384 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27385 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27386 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27387 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27388 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27389 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27390 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27391 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27392 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27393 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27395 /* SSSE3 (palignr). */
27396 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27397 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
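  /* The *_INT_CONVERT prototypes indicate that the expander converts the
     vector arguments to the mode of the insn pattern (TImode/DImode for
     the palignr and pslldqi/psrldqi patterns) before emitting it; the
     trailing immediate operand is passed through unchanged.  */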
27399 /* SSE4.1 */
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27401 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27402 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27403 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27404 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27405 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27406 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27407 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27408 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27409 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27411 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27412 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27413 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27414 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27415 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27416 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27417 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27418 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27419 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27420 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27421 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27422 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27423 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27425 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27426 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27427 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27428 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27429 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27430 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27432 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27433 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27434 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27435 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27436 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27438 /* SSE4.1 (rounding, floor/ceil and ptest forms) */
27439 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27440 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27441 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27442 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
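  /* In the floor/ceil/trunc/rint descriptors below, the comparison field
     is reused to hold the ROUND_* constant that becomes the rounding-mode
     immediate of the round pattern; the *_ROUND prototypes tell the
     expander to pick it up from there.  */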
27444 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27445 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27446 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27447 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27449 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27450 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27452 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27453 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27455 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27456 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27457 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27458 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27460 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27461 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27463 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27464 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27466 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27467 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27468 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
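  /* For the ptest descriptors above (and the AVX vtest/ptest256 ones
     further down) the comparison field selects which flag the builtin
     tests after the PTEST/VTESTPS/VTESTPD instruction: EQ tests ZF
     (testz), LTU tests CF (testc), and GTU tests that neither flag is
     set (testnzc).  */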
27470 /* SSE4.2 */
27471 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27472 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27473 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27474 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27475 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27477 /* SSE4A */
27478 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27479 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27480 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27481 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27483 /* AES */
27484 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27485 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27487 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27488 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27489 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27490 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27492 /* PCLMUL */
27493 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
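  /* The AES and PCLMUL entries above carry a null NAME field: the
     __builtin_ia32_aesenc128 and friends and __builtin_ia32_pclmulqdq128
     builtins are declared separately with their own AES/PCLMUL ISA masks,
     so these descriptors are matched by IX86_BUILTIN code only when the
     builtins are expanded.  */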
27495 /* AVX */
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27497 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27501 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27597 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27600 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27601 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27606 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27607 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27608 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27609 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27610 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27623 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27624 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27631 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27632 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27634 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27636 /* AVX2 */
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27756 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27757 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27766 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27784 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27786 /* BMI */
27787 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27788 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27789 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27791 /* TBM */
27792 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27793 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27795 /* F16C */
27796 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27797 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27798 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27799 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27801 /* BMI2 */
27802 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27803 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27804 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27805 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27806 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27807 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27810 /* FMA4 and XOP. */
27811 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27812 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27813 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27814 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27815 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27816 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27817 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27818 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27819 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27820 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27821 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27822 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27823 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27824 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27825 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27826 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27827 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27828 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27829 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27830 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27831 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27832 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27833 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27834 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27835 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27836 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27837 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27838 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27839 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27840 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27841 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27842 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27843 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27844 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27845 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27846 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27847 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27848 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27849 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27850 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27851 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27852 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27853 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27854 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27855 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27856 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27857 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27858 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27859 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27860 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27861 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27862 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27864 static const struct builtin_description bdesc_multi_arg[] =
27866 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27867 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27868 UNKNOWN, (int)MULTI_ARG_3_SF },
27869 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27870 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27871 UNKNOWN, (int)MULTI_ARG_3_DF },
27873 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27874 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27875 UNKNOWN, (int)MULTI_ARG_3_SF },
27876 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27877 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27878 UNKNOWN, (int)MULTI_ARG_3_DF },
27880 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27881 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27882 UNKNOWN, (int)MULTI_ARG_3_SF },
27883 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27884 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27885 UNKNOWN, (int)MULTI_ARG_3_DF },
27886 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27887 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27888 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27889 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27890 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27891 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27893 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27894 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27895 UNKNOWN, (int)MULTI_ARG_3_SF },
27896 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27897 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27898 UNKNOWN, (int)MULTI_ARG_3_DF },
27899 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27900 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27901 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27902 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27903 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27904 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
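/* Illustrative example (editor's addition, not part of the original source):
   each bdesc_multi_arg entry above defines a GCC builtin that the intrinsic
   headers wrap.  For instance the MULTI_ARG_3_SF entry for
   "__builtin_ia32_vfmaddps" can be exercised directly when compiling with
   -mfma4:

     #include <x86intrin.h>

     __m128
     fma4_madd (__m128 a, __m128 b, __m128 c)
     {
       // a * b + c in one FMA4 vfmaddps instruction
       return (__m128) __builtin_ia32_vfmaddps ((__v4sf) a, (__v4sf) b,
                                                (__v4sf) c);
     }

   The fma4intrin.h wrapper _mm_macc_ps is essentially this call.  */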
28066 /* TM vector builtins. */
28068 /* Reuse the existing x86-specific `struct builtin_description' because
28069    we're lazy.  Add casts to make them fit.  */
28070 static const struct builtin_description bdesc_tm[] =
28072 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28073 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28074 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28075 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28076 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28077 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28078 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28080 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28081 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28082 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28083 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28084 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28085 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28086 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28088 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28089 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28090 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28091 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28092 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28093 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28094 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28096 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28097 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28098 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28101 /* TM callbacks. */
28103 /* Return the builtin decl needed to load a vector of TYPE. */
28105 static tree
28106 ix86_builtin_tm_load (tree type)
28108 if (TREE_CODE (type) == VECTOR_TYPE)
28110 switch (tree_low_cst (TYPE_SIZE (type), 1))
28112 case 64:
28113 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28114 case 128:
28115 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28116 case 256:
28117 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28120 return NULL_TREE;
28123 /* Return the builtin decl needed to store a vector of TYPE. */
28125 static tree
28126 ix86_builtin_tm_store (tree type)
28128 if (TREE_CODE (type) == VECTOR_TYPE)
28130 switch (tree_low_cst (TYPE_SIZE (type), 1))
28132 case 64:
28133 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28134 case 128:
28135 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28136 case 256:
28137 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28140 return NULL_TREE;
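/* Illustrative example (editor's addition): with -fgnu-tm and -msse, the
   trans-mem lowering pass queries the two hooks above for vector-sized
   memory accesses.  Roughly, a copy such as

     #include <xmmintrin.h>

     __m128 g;

     void
     tx_copy (__m128 *p)
     {
       __transaction_atomic { g = *p; }
     }

   is expected to be instrumented with the 128-bit entries from bdesc_tm,
   i.e. a __builtin__ITM_RM128 load of *p and a __builtin__ITM_WM128 store
   to g, rather than falling back to smaller libitm accesses.  */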
28143 /* Initialize the transactional memory vector load/store builtins. */
28145 static void
28146 ix86_init_tm_builtins (void)
28148 enum ix86_builtin_func_type ftype;
28149 const struct builtin_description *d;
28150 size_t i;
28151 tree decl;
28152 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28153 tree attrs_log, attrs_type_log;
28155 if (!flag_tm)
28156 return;
28158 /* If there are no builtins defined, we must be compiling in a
28159 language without trans-mem support. */
28160 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28161 return;
28163 /* Use whatever attributes a normal TM load has. */
28164 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28165 attrs_load = DECL_ATTRIBUTES (decl);
28166 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28167 /* Use whatever attributes a normal TM store has. */
28168 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28169 attrs_store = DECL_ATTRIBUTES (decl);
28170 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28171 /* Use whatever attributes a normal TM log has. */
28172 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28173 attrs_log = DECL_ATTRIBUTES (decl);
28174 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28176 for (i = 0, d = bdesc_tm;
28177 i < ARRAY_SIZE (bdesc_tm);
28178 i++, d++)
28180 if ((d->mask & ix86_isa_flags) != 0
28181 || (lang_hooks.builtin_function
28182 == lang_hooks.builtin_function_ext_scope))
28184 tree type, attrs, attrs_type;
28185 enum built_in_function code = (enum built_in_function) d->code;
28187 ftype = (enum ix86_builtin_func_type) d->flag;
28188 type = ix86_get_builtin_func_type (ftype);
28190 if (BUILTIN_TM_LOAD_P (code))
28192 attrs = attrs_load;
28193 attrs_type = attrs_type_load;
28195 else if (BUILTIN_TM_STORE_P (code))
28197 attrs = attrs_store;
28198 attrs_type = attrs_type_store;
28200 else
28202 attrs = attrs_log;
28203 attrs_type = attrs_type_log;
28205 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28206 /* The builtin name without the "__builtin_"
28207    prefix, for calling it directly. */
28208 d->name + strlen ("__builtin_"),
28209 attrs);
28210 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28211 set the TYPE_ATTRIBUTES. */
28212 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28214 set_builtin_decl (code, decl, false);
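/* Worked example (editor's addition): for the bdesc_tm entry
   "__builtin__ITM_WM128" the loop above effectively performs

     decl = add_builtin_function ("__builtin__ITM_WM128",
                                  ix86_get_builtin_func_type
                                    (VOID_FTYPE_PV4SF_V4SF),
                                  BUILT_IN_TM_STORE_M128, BUILT_IN_NORMAL,
                                  "_ITM_WM128",   // name minus "__builtin_"
                                  attrs_store);
     decl_attributes (&TREE_TYPE (decl), attrs_type_store,
                      ATTR_FLAG_BUILT_IN);
     set_builtin_decl (BUILT_IN_TM_STORE_M128, decl, false);

   so the libitm entry point _ITM_WM128 is reachable both as a builtin and
   under its plain library name.  */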
28219 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28220    not in the current target ISA, so that the user can compile particular
28221    modules with target-specific options that differ from the command-line
28222    options.  */
28223 static void
28224 ix86_init_mmx_sse_builtins (void)
28226 const struct builtin_description * d;
28227 enum ix86_builtin_func_type ftype;
28228 size_t i;
28230 /* Add all special builtins with variable number of operands. */
28231 for (i = 0, d = bdesc_special_args;
28232 i < ARRAY_SIZE (bdesc_special_args);
28233 i++, d++)
28235 if (d->name == 0)
28236 continue;
28238 ftype = (enum ix86_builtin_func_type) d->flag;
28239 def_builtin (d->mask, d->name, ftype, d->code);
28242 /* Add all builtins with variable number of operands. */
28243 for (i = 0, d = bdesc_args;
28244 i < ARRAY_SIZE (bdesc_args);
28245 i++, d++)
28247 if (d->name == 0)
28248 continue;
28250 ftype = (enum ix86_builtin_func_type) d->flag;
28251 def_builtin_const (d->mask, d->name, ftype, d->code);
28254 /* pcmpestr[im] insns. */
28255 for (i = 0, d = bdesc_pcmpestr;
28256 i < ARRAY_SIZE (bdesc_pcmpestr);
28257 i++, d++)
28259 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28260 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28261 else
28262 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28263 def_builtin_const (d->mask, d->name, ftype, d->code);
28266 /* pcmpistr[im] insns. */
28267 for (i = 0, d = bdesc_pcmpistr;
28268 i < ARRAY_SIZE (bdesc_pcmpistr);
28269 i++, d++)
28271 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28272 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28273 else
28274 ftype = INT_FTYPE_V16QI_V16QI_INT;
28275 def_builtin_const (d->mask, d->name, ftype, d->code);
28278 /* comi/ucomi insns. */
28279 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28281 if (d->mask == OPTION_MASK_ISA_SSE2)
28282 ftype = INT_FTYPE_V2DF_V2DF;
28283 else
28284 ftype = INT_FTYPE_V4SF_V4SF;
28285 def_builtin_const (d->mask, d->name, ftype, d->code);
28288 /* SSE */
28289 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28290 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28291 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28292 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28294 /* SSE or 3DNow!A */
28295 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28296 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28297 IX86_BUILTIN_MASKMOVQ);
28299 /* SSE2 */
28300 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28301 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28303 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28304 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28305 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28306 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28308 /* SSE3. */
28309 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28310 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28311 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28312 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28314 /* AES */
28315 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28316 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28317 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28318 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28319 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28320 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28321 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28322 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28323 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28324 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28325 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28326 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28328 /* PCLMUL */
28329 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28330 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28332 /* RDRND */
28333 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28334 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28335 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28336 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28337 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28338 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28339 IX86_BUILTIN_RDRAND64_STEP);
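/* Illustrative example (editor's addition): the *_step builtins return the
   carry flag, so callers retry until the hardware delivers a value.  With
   -mrdrnd:

     #include <immintrin.h>

     unsigned int
     get_random (void)
     {
       unsigned int val;
       while (!_rdrand32_step (&val))   // wraps __builtin_ia32_rdrand32_step
         ;                              // CF=0 means no entropy yet, retry
       return val;
     }
*/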
28341 /* AVX2 */
28342 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28343 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28344 IX86_BUILTIN_GATHERSIV2DF);
28346 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28347 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28348 IX86_BUILTIN_GATHERSIV4DF);
28350 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28351 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28352 IX86_BUILTIN_GATHERDIV2DF);
28354 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28355 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28356 IX86_BUILTIN_GATHERDIV4DF);
28358 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28359 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28360 IX86_BUILTIN_GATHERSIV4SF);
28362 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28363 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28364 IX86_BUILTIN_GATHERSIV8SF);
28366 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28367 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28368 IX86_BUILTIN_GATHERDIV4SF);
28370 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28371 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28372 IX86_BUILTIN_GATHERDIV8SF);
28374 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28375 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28376 IX86_BUILTIN_GATHERSIV2DI);
28378 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28379 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28380 IX86_BUILTIN_GATHERSIV4DI);
28382 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28383 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28384 IX86_BUILTIN_GATHERDIV2DI);
28386 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28387 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28388 IX86_BUILTIN_GATHERDIV4DI);
28390 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28391 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28392 IX86_BUILTIN_GATHERSIV4SI);
28394 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28395 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28396 IX86_BUILTIN_GATHERSIV8SI);
28398 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28399 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28400 IX86_BUILTIN_GATHERDIV4SI);
28402 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28403 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28404 IX86_BUILTIN_GATHERDIV8SI);
28406 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28407 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28408 IX86_BUILTIN_GATHERALTSIV4DF);
28410 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28411 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28412 IX86_BUILTIN_GATHERALTDIV8SF);
28414 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28415 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28416 IX86_BUILTIN_GATHERALTSIV4DI);
28418 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28419 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28420 IX86_BUILTIN_GATHERALTDIV8SI);
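/* Illustrative example (editor's addition): the gather builtins take an
   explicit source, mask and scale operand; the avx2intrin.h wrappers fill
   those in.  With -mavx2,

     #include <immintrin.h>

     __m256d
     gather4 (const double *base, __m128i idx)
     {
       // four doubles at base[idx[0..3]], scale = sizeof (double)
       return _mm256_i32gather_pd (base, idx, 8);
     }

   reaches IX86_BUILTIN_GATHERSIV4DF ("__builtin_ia32_gathersiv4df") with an
   all-ones mask and a zeroed source vector.  */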
28422 /* RTM. */
28423 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28424 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28426 /* MMX access to the vec_init patterns. */
28427 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28428 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28430 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28431 V4HI_FTYPE_HI_HI_HI_HI,
28432 IX86_BUILTIN_VEC_INIT_V4HI);
28434 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28435 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28436 IX86_BUILTIN_VEC_INIT_V8QI);
28438 /* Access to the vec_extract patterns. */
28439 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28440 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28441 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28442 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28443 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28444 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28445 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28446 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28447 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28448 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28450 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28451 "__builtin_ia32_vec_ext_v4hi",
28452 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28454 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28455 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28457 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28458 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
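/* Illustrative example (editor's addition): these vec_ext builtins back the
   element-extraction intrinsics; xmmintrin.h defines (attributes omitted)

     extern __inline float
     _mm_cvtss_f32 (__m128 __A)
     {
       return __builtin_ia32_vec_ext_v4sf ((__v4sf) __A, 0);  // lane 0
     }

   and the vec_set builtins below play the same role for element insertion.  */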
28460 /* Access to the vec_set patterns. */
28461 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28462 "__builtin_ia32_vec_set_v2di",
28463 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28465 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28466 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28468 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28469 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28471 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28472 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28474 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28475 "__builtin_ia32_vec_set_v4hi",
28476 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28478 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28479 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28481 /* RDSEED */
28482 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28483 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28484 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28485 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28486 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28487 "__builtin_ia32_rdseed_di_step",
28488 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28490 /* ADCX */
28491 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28492 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28493 def_builtin (OPTION_MASK_ISA_64BIT,
28494 "__builtin_ia32_addcarryx_u64",
28495 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28496 IX86_BUILTIN_ADDCARRYX64);
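/* Illustrative example (editor's addition): the addcarryx builtins expose an
   add-with-carry chain and return the carry-out as an unsigned char, e.g.

     unsigned char
     add_2limb (unsigned int a0, unsigned int a1,
                unsigned int b0, unsigned int b1,
                unsigned int res[2])
     {
       unsigned char c;
       c = __builtin_ia32_addcarryx_u32 (0, a0, b0, &res[0]); // low limb
       c = __builtin_ia32_addcarryx_u32 (c, a1, b1, &res[1]); // + carry in
       return c;                                              // carry out
     }
*/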
28498 /* Add FMA4 multi-arg instructions.  */
28499 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28501 if (d->name == 0)
28502 continue;
28504 ftype = (enum ix86_builtin_func_type) d->flag;
28505 def_builtin_const (d->mask, d->name, ftype, d->code);
28509 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28510 to return a pointer to VERSION_DECL if the outcome of the expression
28511 formed by PREDICATE_CHAIN is true. This function will be called during
28512 version dispatch to decide which function version to execute. It returns
28513 the basic block at the end, to which more conditions can be added. */
28515 static basic_block
28516 add_condition_to_bb (tree function_decl, tree version_decl,
28517 tree predicate_chain, basic_block new_bb)
28519 gimple return_stmt;
28520 tree convert_expr, result_var;
28521 gimple convert_stmt;
28522 gimple call_cond_stmt;
28523 gimple if_else_stmt;
28525 basic_block bb1, bb2, bb3;
28526 edge e12, e23;
28528 tree cond_var, and_expr_var = NULL_TREE;
28529 gimple_seq gseq;
28531 tree predicate_decl, predicate_arg;
28533 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28535 gcc_assert (new_bb != NULL);
28536 gseq = bb_seq (new_bb);
28539 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28540 build_fold_addr_expr (version_decl));
28541 result_var = create_tmp_var (ptr_type_node, NULL);
28542 convert_stmt = gimple_build_assign (result_var, convert_expr);
28543 return_stmt = gimple_build_return (result_var);
28545 if (predicate_chain == NULL_TREE)
28547 gimple_seq_add_stmt (&gseq, convert_stmt);
28548 gimple_seq_add_stmt (&gseq, return_stmt);
28549 set_bb_seq (new_bb, gseq);
28550 gimple_set_bb (convert_stmt, new_bb);
28551 gimple_set_bb (return_stmt, new_bb);
28552 pop_cfun ();
28553 return new_bb;
28556 while (predicate_chain != NULL)
28558 cond_var = create_tmp_var (integer_type_node, NULL);
28559 predicate_decl = TREE_PURPOSE (predicate_chain);
28560 predicate_arg = TREE_VALUE (predicate_chain);
28561 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28562 gimple_call_set_lhs (call_cond_stmt, cond_var);
28564 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28565 gimple_set_bb (call_cond_stmt, new_bb);
28566 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28568 predicate_chain = TREE_CHAIN (predicate_chain);
28570 if (and_expr_var == NULL)
28571 and_expr_var = cond_var;
28572 else
28574 gimple assign_stmt;
28575 /* Use MIN_EXPR to check whether any integer is zero:
28576    and_expr_var = min_expr <cond_var, and_expr_var>.  */
28577 assign_stmt = gimple_build_assign (and_expr_var,
28578 build2 (MIN_EXPR, integer_type_node,
28579 cond_var, and_expr_var));
28581 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28582 gimple_set_bb (assign_stmt, new_bb);
28583 gimple_seq_add_stmt (&gseq, assign_stmt);
28587 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28588 integer_zero_node,
28589 NULL_TREE, NULL_TREE);
28590 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28591 gimple_set_bb (if_else_stmt, new_bb);
28592 gimple_seq_add_stmt (&gseq, if_else_stmt);
28594 gimple_seq_add_stmt (&gseq, convert_stmt);
28595 gimple_seq_add_stmt (&gseq, return_stmt);
28596 set_bb_seq (new_bb, gseq);
28598 bb1 = new_bb;
28599 e12 = split_block (bb1, if_else_stmt);
28600 bb2 = e12->dest;
28601 e12->flags &= ~EDGE_FALLTHRU;
28602 e12->flags |= EDGE_TRUE_VALUE;
28604 e23 = split_block (bb2, return_stmt);
28606 gimple_set_bb (convert_stmt, bb2);
28607 gimple_set_bb (return_stmt, bb2);
28609 bb3 = e23->dest;
28610 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28612 remove_edge (e23);
28613 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28615 pop_cfun ();
28617 return bb3;
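/* Illustrative sketch (editor's addition, names are hypothetical): for a
   version guarded by the single predicate __builtin_cpu_supports ("avx2"),
   the code built above corresponds to GIMPLE of roughly this shape:

     cond = __builtin_cpu_supports ("avx2");
     if (cond > 0)
       {
         tmp = (void *) &foo.avx2;   // VERSION_DECL
         return tmp;
       }
     // fall through to bb3, where the next condition is appended

   When PREDICATE_CHAIN has several entries, they are combined with MIN_EXPR,
   so the branch is taken only if every predicate returned non-zero.  */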
28620 /* This parses the attribute arguments to target in DECL and determines
28621 the right builtin to use to match the platform specification.
28622 It returns the priority value for this version decl. If PREDICATE_LIST
28623 is not NULL, it stores the list of cpu features that need to be checked
28624 before dispatching this function. */
28626 static unsigned int
28627 get_builtin_code_for_version (tree decl, tree *predicate_list)
28629 tree attrs;
28630 struct cl_target_option cur_target;
28631 tree target_node;
28632 struct cl_target_option *new_target;
28633 const char *arg_str = NULL;
28634 const char *attrs_str = NULL;
28635 char *tok_str = NULL;
28636 char *token;
28638 /* Priority of i386 features, greater value is higher priority. This is
28639 used to decide the order in which function dispatch must happen. For
28640 instance, a version specialized for SSE4.2 should be checked for dispatch
28641 before a version for SSE3, as SSE4.2 implies SSE3. */
28642 enum feature_priority
28644 P_ZERO = 0,
28645 P_MMX,
28646 P_SSE,
28647 P_SSE2,
28648 P_SSE3,
28649 P_SSSE3,
28650 P_PROC_SSSE3,
28651 P_SSE4_a,
28652 P_PROC_SSE4_a,
28653 P_SSE4_1,
28654 P_SSE4_2,
28655 P_PROC_SSE4_2,
28656 P_POPCNT,
28657 P_AVX,
28658 P_AVX2,
28659 P_FMA,
28660 P_PROC_FMA
28663 enum feature_priority priority = P_ZERO;
28665 /* These are the target attribute strings for which a dispatcher is
28666 available, from fold_builtin_cpu. */
28668 static struct _feature_list
28670 const char *const name;
28671 const enum feature_priority priority;
28673 const feature_list[] =
28675 {"mmx", P_MMX},
28676 {"sse", P_SSE},
28677 {"sse2", P_SSE2},
28678 {"sse3", P_SSE3},
28679 {"ssse3", P_SSSE3},
28680 {"sse4.1", P_SSE4_1},
28681 {"sse4.2", P_SSE4_2},
28682 {"popcnt", P_POPCNT},
28683 {"avx", P_AVX},
28684 {"avx2", P_AVX2}
28688 static unsigned int NUM_FEATURES
28689 = sizeof (feature_list) / sizeof (struct _feature_list);
28691 unsigned int i;
28693 tree predicate_chain = NULL_TREE;
28694 tree predicate_decl, predicate_arg;
28696 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28697 gcc_assert (attrs != NULL);
28699 attrs = TREE_VALUE (TREE_VALUE (attrs));
28701 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28702 attrs_str = TREE_STRING_POINTER (attrs);
28705 /* Handle arch= if specified. For priority, set it to be 1 more than
28706 the best instruction set the processor can handle. For instance, if
28707 there is a version for atom and a version for ssse3 (the highest ISA
28708 priority for atom), the atom version must be checked for dispatch
28709 before the ssse3 version. */
28710 if (strstr (attrs_str, "arch=") != NULL)
28712 cl_target_option_save (&cur_target, &global_options);
28713 target_node = ix86_valid_target_attribute_tree (attrs);
28715 gcc_assert (target_node);
28716 new_target = TREE_TARGET_OPTION (target_node);
28717 gcc_assert (new_target);
28719 if (new_target->arch_specified && new_target->arch > 0)
28721 switch (new_target->arch)
28723 case PROCESSOR_CORE2:
28724 arg_str = "core2";
28725 priority = P_PROC_SSSE3;
28726 break;
28727 case PROCESSOR_COREI7:
28728 arg_str = "corei7";
28729 priority = P_PROC_SSE4_2;
28730 break;
28731 case PROCESSOR_ATOM:
28732 arg_str = "atom";
28733 priority = P_PROC_SSSE3;
28734 break;
28735 case PROCESSOR_AMDFAM10:
28736 arg_str = "amdfam10h";
28737 priority = P_PROC_SSE4_a;
28738 break;
28739 case PROCESSOR_BDVER1:
28740 arg_str = "bdver1";
28741 priority = P_PROC_FMA;
28742 break;
28743 case PROCESSOR_BDVER2:
28744 arg_str = "bdver2";
28745 priority = P_PROC_FMA;
28746 break;
28750 cl_target_option_restore (&global_options, &cur_target);
28752 if (predicate_list && arg_str == NULL)
28754 error_at (DECL_SOURCE_LOCATION (decl),
28755 "No dispatcher found for the versioning attributes");
28756 return 0;
28759 if (predicate_list)
28761 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28762 /* For a C string literal the length includes the trailing NULL. */
28763 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28764 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28765 predicate_chain);
28769 /* Process feature name. */
28770 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28771 strcpy (tok_str, attrs_str);
28772 token = strtok (tok_str, ",");
28773 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28775 while (token != NULL)
28777 /* Do not process "arch=" */
28778 if (strncmp (token, "arch=", 5) == 0)
28780 token = strtok (NULL, ",");
28781 continue;
28783 for (i = 0; i < NUM_FEATURES; ++i)
28785 if (strcmp (token, feature_list[i].name) == 0)
28787 if (predicate_list)
28789 predicate_arg = build_string_literal (
28790 strlen (feature_list[i].name) + 1,
28791 feature_list[i].name);
28792 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28793 predicate_chain);
28795 /* Find the maximum priority feature. */
28796 if (feature_list[i].priority > priority)
28797 priority = feature_list[i].priority;
28799 break;
28802 if (predicate_list && i == NUM_FEATURES)
28804 error_at (DECL_SOURCE_LOCATION (decl),
28805 "No dispatcher found for %s", token);
28806 return 0;
28808 token = strtok (NULL, ",");
28810 free (tok_str);
28812 if (predicate_list && predicate_chain == NULL_TREE)
28814 error_at (DECL_SOURCE_LOCATION (decl),
28815 "No dispatcher found for the versioning attributes : %s",
28816 attrs_str);
28817 return 0;
28819 else if (predicate_list)
28821 predicate_chain = nreverse (predicate_chain);
28822 *predicate_list = predicate_chain;
28825 return priority;
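/* Illustrative sketch (editor's note, not in the original source): for a
   multi-versioned function declared roughly as

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("sse4.2")))  int foo (void) { return 1; }
     __attribute__ ((target ("arch=corei7"))) int foo (void) { return 2; }

   this routine would return P_SSE4_2 for the "sse4.2" version, recording a
   __builtin_cpu_supports ("sse4.2") predicate when PREDICATE_LIST is
   non-NULL, and P_PROC_SSE4_2 for the "arch=corei7" version, recording a
   __builtin_cpu_is ("corei7") predicate instead.  */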
28828 /* This compares the priority of target features in function DECL1
28829 and DECL2. It returns positive value if DECL1 is higher priority,
28830 negative value if DECL2 is higher priority and 0 if they are the
28831 same. */
28833 static int
28834 ix86_compare_version_priority (tree decl1, tree decl2)
28836 unsigned int priority1 = 0;
28837 unsigned int priority2 = 0;
28839 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl1)) != NULL)
28840 priority1 = get_builtin_code_for_version (decl1, NULL);
28842 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl2)) != NULL)
28843 priority2 = get_builtin_code_for_version (decl2, NULL);
28845 return (int)priority1 - (int)priority2;
28848 /* V1 and V2 point to function versions with different priorities
28849 based on the target ISA. This function compares their priorities. */
28851 static int
28852 feature_compare (const void *v1, const void *v2)
28854 typedef struct _function_version_info
28856 tree version_decl;
28857 tree predicate_chain;
28858 unsigned int dispatch_priority;
28859 } function_version_info;
28861 const function_version_info c1 = *(const function_version_info *)v1;
28862 const function_version_info c2 = *(const function_version_info *)v2;
28863 return (c2.dispatch_priority - c1.dispatch_priority);
28866 /* This function generates the dispatch function for
28867 multi-versioned functions. DISPATCH_DECL is the function which will
28868 contain the dispatch logic. FNDECLS are the function choices for
28869 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28870 in DISPATCH_DECL in which the dispatch code is generated. */
28872 static int
28873 dispatch_function_versions (tree dispatch_decl,
28874 void *fndecls_p,
28875 basic_block *empty_bb)
28877 tree default_decl;
28878 gimple ifunc_cpu_init_stmt;
28879 gimple_seq gseq;
28880 int ix;
28881 tree ele;
28882 vec<tree> *fndecls;
28883 unsigned int num_versions = 0;
28884 unsigned int actual_versions = 0;
28885 unsigned int i;
28887 struct _function_version_info
28889 tree version_decl;
28890 tree predicate_chain;
28891 unsigned int dispatch_priority;
28892 }*function_version_info;
28894 gcc_assert (dispatch_decl != NULL
28895 && fndecls_p != NULL
28896 && empty_bb != NULL);
28898   /* fndecls_p is actually a vector. */
28899 fndecls = static_cast<vec<tree> *> (fndecls_p);
28901 /* At least one more version other than the default. */
28902 num_versions = fndecls->length ();
28903 gcc_assert (num_versions >= 2);
28905 function_version_info = (struct _function_version_info *)
28906 XNEWVEC (struct _function_version_info, (num_versions - 1));
28908 /* The first version in the vector is the default decl. */
28909 default_decl = (*fndecls)[0];
28911 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28913 gseq = bb_seq (*empty_bb);
28914 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28915      constructors, so explicitly call __builtin_cpu_init here. */
28916 ifunc_cpu_init_stmt = gimple_build_call_vec (
28917 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28918 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28919 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28920 set_bb_seq (*empty_bb, gseq);
28922 pop_cfun ();
28925 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28927 tree version_decl = ele;
28928 tree predicate_chain = NULL_TREE;
28929 unsigned int priority;
28930 /* Get attribute string, parse it and find the right predicate decl.
28931 The predicate function could be a lengthy combination of many
28932 features, like arch-type and various isa-variants. */
28933 priority = get_builtin_code_for_version (version_decl,
28934 &predicate_chain);
28936 if (predicate_chain == NULL_TREE)
28937 continue;
28939 actual_versions++;
28940 function_version_info [ix - 1].version_decl = version_decl;
28941 function_version_info [ix - 1].predicate_chain = predicate_chain;
28942 function_version_info [ix - 1].dispatch_priority = priority;
28945 /* Sort the versions according to descending order of dispatch priority. The
28946 priority is based on the ISA. This is not a perfect solution. There
28947 could still be ambiguity. If more than one function version is suitable
28948      to execute, which one should be dispatched?  In the future, allow the user
28949 to specify a dispatch priority next to the version. */
28950 qsort (function_version_info, actual_versions,
28951 sizeof (struct _function_version_info), feature_compare);
28953 for (i = 0; i < actual_versions; ++i)
28954 *empty_bb = add_condition_to_bb (dispatch_decl,
28955 function_version_info[i].version_decl,
28956 function_version_info[i].predicate_chain,
28957 *empty_bb);
28959   /* Dispatch the default version at the end. */
28960 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28961 NULL, *empty_bb);
28963 free (function_version_info);
28964 return 0;
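/* Editor's sketch of the dispatch logic built above (conceptual C; the real
   code is emitted as GIMPLE conditions via add_condition_to_bb, and names
   such as foo.arch_corei7 are illustrative):

     if (__builtin_cpu_is ("corei7"))
       return foo.arch_corei7 (args);
     if (__builtin_cpu_supports ("avx"))
       return foo.avx (args);
     return foo (args);   // default version, dispatched last

   Higher-priority versions are tested first, in the qsort order computed
   above.  */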
28967 /* Comparator function to be used in qsort routine to sort attribute
28968 specification strings to "target". */
28970 static int
28971 attr_strcmp (const void *v1, const void *v2)
28973 const char *c1 = *(char *const*)v1;
28974 const char *c2 = *(char *const*)v2;
28975 return strcmp (c1, c2);
28978 /* ARGLIST is the argument to target attribute. This function tokenizes
28979 the comma separated arguments, sorts them and returns a string which
28980 is a unique identifier for the comma separated arguments. It also
28981 replaces non-identifier characters "=,-" with "_". */
28983 static char *
28984 sorted_attr_string (tree arglist)
28986 tree arg;
28987 size_t str_len_sum = 0;
28988 char **args = NULL;
28989 char *attr_str, *ret_str;
28990 char *attr = NULL;
28991 unsigned int argnum = 1;
28992 unsigned int i;
28994 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
28996 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
28997 size_t len = strlen (str);
28998 str_len_sum += len + 1;
28999 if (arg != arglist)
29000 argnum++;
29001 for (i = 0; i < strlen (str); i++)
29002 if (str[i] == ',')
29003 argnum++;
29006 attr_str = XNEWVEC (char, str_len_sum);
29007 str_len_sum = 0;
29008 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29010 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29011 size_t len = strlen (str);
29012 memcpy (attr_str + str_len_sum, str, len);
29013 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29014 str_len_sum += len + 1;
29017 /* Replace "=,-" with "_". */
29018 for (i = 0; i < strlen (attr_str); i++)
29019 if (attr_str[i] == '=' || attr_str[i]== '-')
29020 attr_str[i] = '_';
29022 if (argnum == 1)
29023 return attr_str;
29025 args = XNEWVEC (char *, argnum);
29027 i = 0;
29028 attr = strtok (attr_str, ",");
29029 while (attr != NULL)
29031 args[i] = attr;
29032 i++;
29033 attr = strtok (NULL, ",");
29036 qsort (args, argnum, sizeof (char *), attr_strcmp);
29038 ret_str = XNEWVEC (char, str_len_sum);
29039 str_len_sum = 0;
29040 for (i = 0; i < argnum; i++)
29042 size_t len = strlen (args[i]);
29043 memcpy (ret_str + str_len_sum, args[i], len);
29044 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29045 str_len_sum += len + 1;
29048 XDELETEVEC (args);
29049 XDELETEVEC (attr_str);
29050 return ret_str;
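/* Example (editor's note, following the behaviour above): the arguments of
   target ("sse4.2,arch=corei7") first become the string
   "sse4.2,arch_corei7" once '=' is replaced, then the tokens are sorted and
   rejoined with '_', yielding "arch_corei7_sse4.2".  Note that '.' is left
   untouched.  */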
29053 /* This function changes the assembler name for functions that are
29054 versions. If DECL is a function version and has a "target"
29055 attribute, it appends the attribute string to its assembler name. */
29057 static tree
29058 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29060 tree version_attr;
29061 const char *orig_name, *version_string;
29062 char *attr_str, *assembler_name;
29064 if (DECL_DECLARED_INLINE_P (decl)
29065 && lookup_attribute ("gnu_inline",
29066 DECL_ATTRIBUTES (decl)))
29067 error_at (DECL_SOURCE_LOCATION (decl),
29068 "Function versions cannot be marked as gnu_inline,"
29069 " bodies have to be generated");
29071 if (DECL_VIRTUAL_P (decl)
29072 || DECL_VINDEX (decl))
29073 error_at (DECL_SOURCE_LOCATION (decl),
29074 "Virtual function versioning not supported\n");
29076 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29078 /* target attribute string is NULL for default functions. */
29079 if (version_attr == NULL_TREE)
29080 return id;
29082 orig_name = IDENTIFIER_POINTER (id);
29083 version_string
29084 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29086 if (strcmp (version_string, "default") == 0)
29087 return id;
29089 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29090 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29092 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29094 /* Allow assembler name to be modified if already set. */
29095 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29096 SET_DECL_RTL (decl, NULL);
29098 tree ret = get_identifier (assembler_name);
29099 XDELETEVEC (attr_str);
29100 XDELETEVEC (assembler_name);
29101 return ret;
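/* Example (editor's note): for a C function foo declared with
   __attribute__ ((target ("avx,popcnt"))), sorted_attr_string yields
   "avx_popcnt" and the version's assembler name becomes "foo.avx_popcnt";
   the "default" version keeps its original assembler name.  */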
29104 /* This function returns true if FN1 and FN2 are versions of the same function,
29105 that is, the target strings of the function decls are different. This assumes
29106 that FN1 and FN2 have the same signature. */
29108 static bool
29109 ix86_function_versions (tree fn1, tree fn2)
29111 tree attr1, attr2;
29112 char *target1, *target2;
29113 bool result;
29115 if (TREE_CODE (fn1) != FUNCTION_DECL
29116 || TREE_CODE (fn2) != FUNCTION_DECL)
29117 return false;
29119 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29120 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29122 /* At least one function decl should have the target attribute specified. */
29123 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29124 return false;
29126 /* Diagnose missing target attribute if one of the decls is already
29127 multi-versioned. */
29128 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29130 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29132 if (attr2 != NULL_TREE)
29134 tree tem = fn1;
29135 fn1 = fn2;
29136 fn2 = tem;
29137 attr1 = attr2;
29139 error_at (DECL_SOURCE_LOCATION (fn2),
29140 "missing %<target%> attribute for multi-versioned %D",
29141 fn2);
29142 error_at (DECL_SOURCE_LOCATION (fn1),
29143 "previous declaration of %D", fn1);
29144 /* Prevent diagnosing of the same error multiple times. */
29145 DECL_ATTRIBUTES (fn2)
29146 = tree_cons (get_identifier ("target"),
29147 copy_node (TREE_VALUE (attr1)),
29148 DECL_ATTRIBUTES (fn2));
29150 return false;
29153 target1 = sorted_attr_string (TREE_VALUE (attr1));
29154 target2 = sorted_attr_string (TREE_VALUE (attr2));
29156 /* The sorted target strings must be different for fn1 and fn2
29157 to be versions. */
29158 if (strcmp (target1, target2) == 0)
29159 result = false;
29160 else
29161 result = true;
29163 XDELETEVEC (target1);
29164 XDELETEVEC (target2);
29166 return result;
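/* Example (editor's note): two declarations of foo carrying
   target ("avx,sse4.2") and target ("sse4.2,avx") are NOT versions of each
   other, because their sorted attribute strings are identical; pairing
   target ("avx") with target ("sse4.2") does make them versions.  */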
29169 static tree
29170 ix86_mangle_decl_assembler_name (tree decl, tree id)
29172 /* For function version, add the target suffix to the assembler name. */
29173 if (TREE_CODE (decl) == FUNCTION_DECL
29174 && DECL_FUNCTION_VERSIONED (decl))
29175 id = ix86_mangle_function_version_assembler_name (decl, id);
29176 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29177 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29178 #endif
29180 return id;
29183 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29184 is true, append the full path name of the source file. */
29186 static char *
29187 make_name (tree decl, const char *suffix, bool make_unique)
29189 char *global_var_name;
29190 int name_len;
29191 const char *name;
29192 const char *unique_name = NULL;
29194 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29196 /* Get a unique name that can be used globally without any chances
29197 of collision at link time. */
29198 if (make_unique)
29199 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29201 name_len = strlen (name) + strlen (suffix) + 2;
29203 if (make_unique)
29204 name_len += strlen (unique_name) + 1;
29205 global_var_name = XNEWVEC (char, name_len);
29207 /* Use '.' to concatenate names as it is demangler friendly. */
29208 if (make_unique)
29209 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29210 suffix);
29211 else
29212 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29214 return global_var_name;
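/* Example (editor's note): make_name (foo_decl, "resolver", false) produces
   "foo.resolver", while with MAKE_UNIQUE true the result is
   "foo.<unique>.resolver", where <unique> is the file-scope name obtained
   from get_file_function_name.  */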
29217 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29219 /* Make a dispatcher declaration for the multi-versioned function DECL.
29220 Calls to DECL function will be replaced with calls to the dispatcher
29221 by the front-end. Return the decl created. */
29223 static tree
29224 make_dispatcher_decl (const tree decl)
29226 tree func_decl;
29227 char *func_name;
29228 tree fn_type, func_type;
29229 bool is_uniq = false;
29231 if (TREE_PUBLIC (decl) == 0)
29232 is_uniq = true;
29234 func_name = make_name (decl, "ifunc", is_uniq);
29236 fn_type = TREE_TYPE (decl);
29237 func_type = build_function_type (TREE_TYPE (fn_type),
29238 TYPE_ARG_TYPES (fn_type));
29240 func_decl = build_fn_decl (func_name, func_type);
29241 XDELETEVEC (func_name);
29242 TREE_USED (func_decl) = 1;
29243 DECL_CONTEXT (func_decl) = NULL_TREE;
29244 DECL_INITIAL (func_decl) = error_mark_node;
29245 DECL_ARTIFICIAL (func_decl) = 1;
29246 /* Mark this func as external, the resolver will flip it again if
29247 it gets generated. */
29248 DECL_EXTERNAL (func_decl) = 1;
29249   /* This will be an IFUNC; IFUNCs have to be externally visible. */
29250 TREE_PUBLIC (func_decl) = 1;
29252 return func_decl;
29255 #endif
29257 /* Returns true if DECL is multi-versioned and is the default function,
29258    that is, it is not tagged with target-specific optimization options. */
29260 static bool
29261 is_function_default_version (const tree decl)
29263 if (TREE_CODE (decl) != FUNCTION_DECL
29264 || !DECL_FUNCTION_VERSIONED (decl))
29265 return false;
29266 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29267 gcc_assert (attr);
29268 attr = TREE_VALUE (TREE_VALUE (attr));
29269 return (TREE_CODE (attr) == STRING_CST
29270 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29273 /* Make a dispatcher declaration for the multi-versioned function DECL.
29274 Calls to DECL function will be replaced with calls to the dispatcher
29275 by the front-end. Returns the decl of the dispatcher function. */
29277 static tree
29278 ix86_get_function_versions_dispatcher (void *decl)
29280 tree fn = (tree) decl;
29281 struct cgraph_node *node = NULL;
29282 struct cgraph_node *default_node = NULL;
29283 struct cgraph_function_version_info *node_v = NULL;
29284 struct cgraph_function_version_info *first_v = NULL;
29286 tree dispatch_decl = NULL;
29288 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29289 struct cgraph_function_version_info *it_v = NULL;
29290 struct cgraph_node *dispatcher_node = NULL;
29291 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29292 #endif
29294 struct cgraph_function_version_info *default_version_info = NULL;
29296 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29298 node = cgraph_get_node (fn);
29299 gcc_assert (node != NULL);
29301 node_v = get_cgraph_node_version (node);
29302 gcc_assert (node_v != NULL);
29304 if (node_v->dispatcher_resolver != NULL)
29305 return node_v->dispatcher_resolver;
29307 /* Find the default version and make it the first node. */
29308 first_v = node_v;
29309   /* Go to the beginning of the chain. */
29310 while (first_v->prev != NULL)
29311 first_v = first_v->prev;
29312 default_version_info = first_v;
29313 while (default_version_info != NULL)
29315 if (is_function_default_version
29316 (default_version_info->this_node->symbol.decl))
29317 break;
29318 default_version_info = default_version_info->next;
29321 /* If there is no default node, just return NULL. */
29322 if (default_version_info == NULL)
29323 return NULL;
29325 /* Make default info the first node. */
29326 if (first_v != default_version_info)
29328 default_version_info->prev->next = default_version_info->next;
29329 if (default_version_info->next)
29330 default_version_info->next->prev = default_version_info->prev;
29331 first_v->prev = default_version_info;
29332 default_version_info->next = first_v;
29333 default_version_info->prev = NULL;
29336 default_node = default_version_info->this_node;
29338 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29339 /* Right now, the dispatching is done via ifunc. */
29340 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29342 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29343 gcc_assert (dispatcher_node != NULL);
29344 dispatcher_node->dispatcher_function = 1;
29345 dispatcher_version_info
29346 = insert_new_cgraph_node_version (dispatcher_node);
29347 dispatcher_version_info->next = default_version_info;
29348 dispatcher_node->local.finalized = 1;
29350 /* Set the dispatcher for all the versions. */
29351 it_v = default_version_info;
29352 while (it_v != NULL)
29354 it_v->dispatcher_resolver = dispatch_decl;
29355 it_v = it_v->next;
29357 #else
29358 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29359 "multiversioning needs ifunc which is not supported "
29360 "in this configuration");
29361 #endif
29362 return dispatch_decl;
29365 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29366 it to CHAIN. */
29368 static tree
29369 make_attribute (const char *name, const char *arg_name, tree chain)
29371 tree attr_name;
29372 tree attr_arg_name;
29373 tree attr_args;
29374 tree attr;
29376 attr_name = get_identifier (name);
29377 attr_arg_name = build_string (strlen (arg_name), arg_name);
29378 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29379 attr = tree_cons (attr_name, attr_args, chain);
29380 return attr;
29383 /* Make the resolver function decl to dispatch the versions of
29384 a multi-versioned function, DEFAULT_DECL. Create an
29385 empty basic block in the resolver and store the pointer in
29386 EMPTY_BB. Return the decl of the resolver function. */
29388 static tree
29389 make_resolver_func (const tree default_decl,
29390 const tree dispatch_decl,
29391 basic_block *empty_bb)
29393 char *resolver_name;
29394 tree decl, type, decl_name, t;
29395 bool is_uniq = false;
29397 /* IFUNC's have to be globally visible. So, if the default_decl is
29398 not, then the name of the IFUNC should be made unique. */
29399 if (TREE_PUBLIC (default_decl) == 0)
29400 is_uniq = true;
29402 /* Append the filename to the resolver function if the versions are
29403 not externally visible. This is because the resolver function has
29404 to be externally visible for the loader to find it. So, appending
29405 the filename will prevent conflicts with a resolver function from
29406 another module which is based on the same version name. */
29407 resolver_name = make_name (default_decl, "resolver", is_uniq);
29409 /* The resolver function should return a (void *). */
29410 type = build_function_type_list (ptr_type_node, NULL_TREE);
29412 decl = build_fn_decl (resolver_name, type);
29413 decl_name = get_identifier (resolver_name);
29414 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29416 DECL_NAME (decl) = decl_name;
29417 TREE_USED (decl) = 1;
29418 DECL_ARTIFICIAL (decl) = 1;
29419 DECL_IGNORED_P (decl) = 0;
29420 /* IFUNC resolvers have to be externally visible. */
29421 TREE_PUBLIC (decl) = 1;
29422 DECL_UNINLINABLE (decl) = 0;
29424 /* Resolver is not external, body is generated. */
29425 DECL_EXTERNAL (decl) = 0;
29426 DECL_EXTERNAL (dispatch_decl) = 0;
29428 DECL_CONTEXT (decl) = NULL_TREE;
29429 DECL_INITIAL (decl) = make_node (BLOCK);
29430 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29432 if (DECL_COMDAT_GROUP (default_decl)
29433 || TREE_PUBLIC (default_decl))
29435 /* In this case, each translation unit with a call to this
29436 versioned function will put out a resolver. Ensure it
29437 is comdat to keep just one copy. */
29438 DECL_COMDAT (decl) = 1;
29439 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29441 /* Build result decl and add to function_decl. */
29442 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29443 DECL_ARTIFICIAL (t) = 1;
29444 DECL_IGNORED_P (t) = 1;
29445 DECL_RESULT (decl) = t;
29447 gimplify_function_tree (decl);
29448 push_cfun (DECL_STRUCT_FUNCTION (decl));
29449 *empty_bb = init_lowered_empty_function (decl, false);
29451 cgraph_add_new_function (decl, true);
29452 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29454 pop_cfun ();
29456 gcc_assert (dispatch_decl != NULL);
29457 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29458 DECL_ATTRIBUTES (dispatch_decl)
29459 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29461 /* Create the alias for dispatch to resolver here. */
29462 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29463 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29464 XDELETEVEC (resolver_name);
29465 return decl;
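/* Editor's sketch of the effect (conceptual, assuming a function foo): the
   resolver is emitted as a public, comdat function named "foo.resolver"
   returning void *, its body is filled in by dispatch_function_versions,
   and the dispatcher decl created by make_dispatcher_decl is tagged as if
   it had been written

     int foo (void) __attribute__ ((ifunc ("foo.resolver")));

   so that the dynamic loader invokes foo.resolver to pick the version.  */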
29468 /* Generate the dispatching code body to dispatch multi-versioned function
29469 DECL. The target hook is called to process the "target" attributes and
29470 provide the code to dispatch the right function at run-time. NODE points
29471 to the dispatcher decl whose body will be created. */
29473 static tree
29474 ix86_generate_version_dispatcher_body (void *node_p)
29476 tree resolver_decl;
29477 basic_block empty_bb;
29478 vec<tree> fn_ver_vec = vNULL;
29479 tree default_ver_decl;
29480 struct cgraph_node *versn;
29481 struct cgraph_node *node;
29483 struct cgraph_function_version_info *node_version_info = NULL;
29484 struct cgraph_function_version_info *versn_info = NULL;
29486 node = (cgraph_node *)node_p;
29488 node_version_info = get_cgraph_node_version (node);
29489 gcc_assert (node->dispatcher_function
29490 && node_version_info != NULL);
29492 if (node_version_info->dispatcher_resolver)
29493 return node_version_info->dispatcher_resolver;
29495 /* The first version in the chain corresponds to the default version. */
29496 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29498 /* node is going to be an alias, so remove the finalized bit. */
29499 node->local.finalized = false;
29501 resolver_decl = make_resolver_func (default_ver_decl,
29502 node->symbol.decl, &empty_bb);
29504 node_version_info->dispatcher_resolver = resolver_decl;
29506 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29508 fn_ver_vec.create (2);
29510 for (versn_info = node_version_info->next; versn_info;
29511 versn_info = versn_info->next)
29513 versn = versn_info->this_node;
29514 /* Check for virtual functions here again, as by this time it should
29515 have been determined if this function needs a vtable index or
29516 not. This happens for methods in derived classes that override
29517 virtual methods in base classes but are not explicitly marked as
29518 virtual. */
29519 if (DECL_VINDEX (versn->symbol.decl))
29520 error_at (DECL_SOURCE_LOCATION (versn->symbol.decl),
29521 "Virtual function multiversioning not supported");
29522 fn_ver_vec.safe_push (versn->symbol.decl);
29525 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29526 fn_ver_vec.release ();
29527 rebuild_cgraph_edges ();
29528 pop_cfun ();
29529 return resolver_decl;
29531 /* This builds the processor_model struct type defined in
29532 libgcc/config/i386/cpuinfo.c */
29534 static tree
29535 build_processor_model_struct (void)
29537 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29538 "__cpu_features"};
29539 tree field = NULL_TREE, field_chain = NULL_TREE;
29540 int i;
29541 tree type = make_node (RECORD_TYPE);
29543 /* The first 3 fields are unsigned int. */
29544 for (i = 0; i < 3; ++i)
29546 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29547 get_identifier (field_name[i]), unsigned_type_node);
29548 if (field_chain != NULL_TREE)
29549 DECL_CHAIN (field) = field_chain;
29550 field_chain = field;
29553 /* The last field is an array of unsigned integers of size one. */
29554 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29555 get_identifier (field_name[3]),
29556 build_array_type (unsigned_type_node,
29557 build_index_type (size_one_node)));
29558 if (field_chain != NULL_TREE)
29559 DECL_CHAIN (field) = field_chain;
29560 field_chain = field;
29562 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29563 return type;
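/* Editor's note: the layout mirrors the definition in
   libgcc/config/i386/cpuinfo.c, roughly

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   fold_builtin_cpu below reads the fields of the __cpu_model variable of
   this type.  */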
29566 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29568 static tree
29569 make_var_decl (tree type, const char *name)
29571 tree new_decl;
29573 new_decl = build_decl (UNKNOWN_LOCATION,
29574 VAR_DECL,
29575 get_identifier(name),
29576 type);
29578 DECL_EXTERNAL (new_decl) = 1;
29579 TREE_STATIC (new_decl) = 1;
29580 TREE_PUBLIC (new_decl) = 1;
29581 DECL_INITIAL (new_decl) = 0;
29582 DECL_ARTIFICIAL (new_decl) = 0;
29583 DECL_PRESERVE_P (new_decl) = 1;
29585 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29586 assemble_variable (new_decl, 0, 0, 0);
29588 return new_decl;
29591 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29592 into an integer defined in libgcc/config/i386/cpuinfo.c */
29594 static tree
29595 fold_builtin_cpu (tree fndecl, tree *args)
29597 unsigned int i;
29598 enum ix86_builtins fn_code = (enum ix86_builtins)
29599 DECL_FUNCTION_CODE (fndecl);
29600 tree param_string_cst = NULL;
29602 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29603 enum processor_features
29605 F_CMOV = 0,
29606 F_MMX,
29607 F_POPCNT,
29608 F_SSE,
29609 F_SSE2,
29610 F_SSE3,
29611 F_SSSE3,
29612 F_SSE4_1,
29613 F_SSE4_2,
29614 F_AVX,
29615 F_AVX2,
29616 F_MAX
29619   /* These are the values for vendor types and CPU types and subtypes
29620      in cpuinfo.c.  CPU types and subtypes should have the corresponding
29621      start value subtracted. */
29622 enum processor_model
29624 M_INTEL = 1,
29625 M_AMD,
29626 M_CPU_TYPE_START,
29627 M_INTEL_ATOM,
29628 M_INTEL_CORE2,
29629 M_INTEL_COREI7,
29630 M_AMDFAM10H,
29631 M_AMDFAM15H,
29632 M_CPU_SUBTYPE_START,
29633 M_INTEL_COREI7_NEHALEM,
29634 M_INTEL_COREI7_WESTMERE,
29635 M_INTEL_COREI7_SANDYBRIDGE,
29636 M_AMDFAM10H_BARCELONA,
29637 M_AMDFAM10H_SHANGHAI,
29638 M_AMDFAM10H_ISTANBUL,
29639 M_AMDFAM15H_BDVER1,
29640 M_AMDFAM15H_BDVER2,
29641 M_AMDFAM15H_BDVER3
29644 static struct _arch_names_table
29646 const char *const name;
29647 const enum processor_model model;
29649 const arch_names_table[] =
29651 {"amd", M_AMD},
29652 {"intel", M_INTEL},
29653 {"atom", M_INTEL_ATOM},
29654 {"core2", M_INTEL_CORE2},
29655 {"corei7", M_INTEL_COREI7},
29656 {"nehalem", M_INTEL_COREI7_NEHALEM},
29657 {"westmere", M_INTEL_COREI7_WESTMERE},
29658 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29659 {"amdfam10h", M_AMDFAM10H},
29660 {"barcelona", M_AMDFAM10H_BARCELONA},
29661 {"shanghai", M_AMDFAM10H_SHANGHAI},
29662 {"istanbul", M_AMDFAM10H_ISTANBUL},
29663 {"amdfam15h", M_AMDFAM15H},
29664 {"bdver1", M_AMDFAM15H_BDVER1},
29665 {"bdver2", M_AMDFAM15H_BDVER2},
29666 {"bdver3", M_AMDFAM15H_BDVER3},
29669 static struct _isa_names_table
29671 const char *const name;
29672 const enum processor_features feature;
29674 const isa_names_table[] =
29676 {"cmov", F_CMOV},
29677 {"mmx", F_MMX},
29678 {"popcnt", F_POPCNT},
29679 {"sse", F_SSE},
29680 {"sse2", F_SSE2},
29681 {"sse3", F_SSE3},
29682 {"ssse3", F_SSSE3},
29683 {"sse4.1", F_SSE4_1},
29684 {"sse4.2", F_SSE4_2},
29685 {"avx", F_AVX},
29686 {"avx2", F_AVX2}
29689 tree __processor_model_type = build_processor_model_struct ();
29690 tree __cpu_model_var = make_var_decl (__processor_model_type,
29691 "__cpu_model");
29693 gcc_assert ((args != NULL) && (*args != NULL));
29695 param_string_cst = *args;
29696 while (param_string_cst
29697 && TREE_CODE (param_string_cst) != STRING_CST)
29699       /* *args must be an expr that can contain other EXPRS leading to a
29700 STRING_CST. */
29701 if (!EXPR_P (param_string_cst))
29703 error ("Parameter to builtin must be a string constant or literal");
29704 return integer_zero_node;
29706 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29709 gcc_assert (param_string_cst);
29711 if (fn_code == IX86_BUILTIN_CPU_IS)
29713 tree ref;
29714 tree field;
29715 tree final;
29717 unsigned int field_val = 0;
29718 unsigned int NUM_ARCH_NAMES
29719 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29721 for (i = 0; i < NUM_ARCH_NAMES; i++)
29722 if (strcmp (arch_names_table[i].name,
29723 TREE_STRING_POINTER (param_string_cst)) == 0)
29724 break;
29726 if (i == NUM_ARCH_NAMES)
29728 error ("Parameter to builtin not valid: %s",
29729 TREE_STRING_POINTER (param_string_cst));
29730 return integer_zero_node;
29733 field = TYPE_FIELDS (__processor_model_type);
29734 field_val = arch_names_table[i].model;
29736 /* CPU types are stored in the next field. */
29737 if (field_val > M_CPU_TYPE_START
29738 && field_val < M_CPU_SUBTYPE_START)
29740 field = DECL_CHAIN (field);
29741 field_val -= M_CPU_TYPE_START;
29744 /* CPU subtypes are stored in the next field. */
29745 if (field_val > M_CPU_SUBTYPE_START)
29747 field = DECL_CHAIN ( DECL_CHAIN (field));
29748 field_val -= M_CPU_SUBTYPE_START;
29751 /* Get the appropriate field in __cpu_model. */
29752 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29753 field, NULL_TREE);
29755 /* Check the value. */
29756 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29757 build_int_cstu (unsigned_type_node, field_val));
29758 return build1 (CONVERT_EXPR, integer_type_node, final);
29760 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29762 tree ref;
29763 tree array_elt;
29764 tree field;
29765 tree final;
29767 unsigned int field_val = 0;
29768 unsigned int NUM_ISA_NAMES
29769 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29771 for (i = 0; i < NUM_ISA_NAMES; i++)
29772 if (strcmp (isa_names_table[i].name,
29773 TREE_STRING_POINTER (param_string_cst)) == 0)
29774 break;
29776 if (i == NUM_ISA_NAMES)
29778 error ("Parameter to builtin not valid: %s",
29779 TREE_STRING_POINTER (param_string_cst));
29780 return integer_zero_node;
29783 field = TYPE_FIELDS (__processor_model_type);
29784 /* Get the last field, which is __cpu_features. */
29785 while (DECL_CHAIN (field))
29786 field = DECL_CHAIN (field);
29788 /* Get the appropriate field: __cpu_model.__cpu_features */
29789 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29790 field, NULL_TREE);
29792 /* Access the 0th element of __cpu_features array. */
29793 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29794 integer_zero_node, NULL_TREE, NULL_TREE);
29796 field_val = (1 << isa_names_table[i].feature);
29797 /* Return __cpu_model.__cpu_features[0] & field_val */
29798 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29799 build_int_cstu (unsigned_type_node, field_val));
29800 return build1 (CONVERT_EXPR, integer_type_node, final);
29802 gcc_unreachable ();
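/* Editor's examples of the folding performed above (approximate C):

     __builtin_cpu_is ("corei7")
       => (int) (__cpu_model.__cpu_type
                 == (unsigned) (M_INTEL_COREI7 - M_CPU_TYPE_START));

     __builtin_cpu_supports ("avx")
       => (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX));

   Unknown CPU or feature names are diagnosed and folded to 0.  */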
29805 static tree
29806 ix86_fold_builtin (tree fndecl, int n_args,
29807 tree *args, bool ignore ATTRIBUTE_UNUSED)
29809 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29811 enum ix86_builtins fn_code = (enum ix86_builtins)
29812 DECL_FUNCTION_CODE (fndecl);
29813 if (fn_code == IX86_BUILTIN_CPU_IS
29814 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29816 gcc_assert (n_args == 1);
29817 return fold_builtin_cpu (fndecl, args);
29821 #ifdef SUBTARGET_FOLD_BUILTIN
29822 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29823 #endif
29825 return NULL_TREE;
29828 /* Make builtins to detect cpu type and features supported. NAME is
29829 the builtin name, CODE is the builtin code, and FTYPE is the function
29830 type of the builtin. */
29832 static void
29833 make_cpu_type_builtin (const char* name, int code,
29834 enum ix86_builtin_func_type ftype, bool is_const)
29836 tree decl;
29837 tree type;
29839 type = ix86_get_builtin_func_type (ftype);
29840 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29841 NULL, NULL_TREE);
29842 gcc_assert (decl != NULL_TREE);
29843 ix86_builtins[(int) code] = decl;
29844 TREE_READONLY (decl) = is_const;
29847 /* Make builtins to get CPU type and features supported. The created
29848    builtins are:
29850 __builtin_cpu_init (), to detect cpu type and features,
29851 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29852 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29855 static void
29856 ix86_init_platform_type_builtins (void)
29858 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29859 INT_FTYPE_VOID, false);
29860 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29861 INT_FTYPE_PCCHAR, true);
29862 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29863 INT_FTYPE_PCCHAR, true);
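/* Editor's usage sketch for the builtins registered above:

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       use_corei7_path ();
     else if (__builtin_cpu_supports ("sse4.2"))
       use_sse42_path ();

   use_corei7_path and use_sse42_path are hypothetical helpers shown only
   for illustration.  */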
29866 /* Internal method for ix86_init_builtins. */
29868 static void
29869 ix86_init_builtins_va_builtins_abi (void)
29871 tree ms_va_ref, sysv_va_ref;
29872 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29873 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29874 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29875 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29877 if (!TARGET_64BIT)
29878 return;
29879 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29880 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29881 ms_va_ref = build_reference_type (ms_va_list_type_node);
29882 sysv_va_ref =
29883 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29885 fnvoid_va_end_ms =
29886 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29887 fnvoid_va_start_ms =
29888 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29889 fnvoid_va_end_sysv =
29890 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29891 fnvoid_va_start_sysv =
29892 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29893 NULL_TREE);
29894 fnvoid_va_copy_ms =
29895 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29896 NULL_TREE);
29897 fnvoid_va_copy_sysv =
29898 build_function_type_list (void_type_node, sysv_va_ref,
29899 sysv_va_ref, NULL_TREE);
29901 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29902 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29903 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29904 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29905 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29906 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29907 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29908 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29909 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29910 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29911 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29912 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29915 static void
29916 ix86_init_builtin_types (void)
29918 tree float128_type_node, float80_type_node;
29920 /* The __float80 type. */
29921 float80_type_node = long_double_type_node;
29922 if (TYPE_MODE (float80_type_node) != XFmode)
29924 /* The __float80 type. */
29925 float80_type_node = make_node (REAL_TYPE);
29927 TYPE_PRECISION (float80_type_node) = 80;
29928 layout_type (float80_type_node);
29930 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29932 /* The __float128 type. */
29933 float128_type_node = make_node (REAL_TYPE);
29934 TYPE_PRECISION (float128_type_node) = 128;
29935 layout_type (float128_type_node);
29936 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29938 /* This macro is built by i386-builtin-types.awk. */
29939 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29942 static void
29943 ix86_init_builtins (void)
29945 tree t;
29947 ix86_init_builtin_types ();
29949 /* Builtins to get CPU type and features. */
29950 ix86_init_platform_type_builtins ();
29952 /* TFmode support builtins. */
29953 def_builtin_const (0, "__builtin_infq",
29954 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29955 def_builtin_const (0, "__builtin_huge_valq",
29956 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29958   /* We will expand them to a normal call if SSE isn't available, since
29959 they are used by libgcc. */
29960 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29961 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29962 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29963 TREE_READONLY (t) = 1;
29964 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29966 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29967 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29968 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29969 TREE_READONLY (t) = 1;
29970 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29972 ix86_init_tm_builtins ();
29973 ix86_init_mmx_sse_builtins ();
29975 if (TARGET_LP64)
29976 ix86_init_builtins_va_builtins_abi ();
29978 #ifdef SUBTARGET_INIT_BUILTINS
29979 SUBTARGET_INIT_BUILTINS;
29980 #endif
29983 /* Return the ix86 builtin for CODE. */
29985 static tree
29986 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29988 if (code >= IX86_BUILTIN_MAX)
29989 return error_mark_node;
29991 return ix86_builtins[code];
29994 /* Errors in the source file can cause expand_expr to return const0_rtx
29995 where we expect a vector. To avoid crashing, use one of the vector
29996 clear instructions. */
29997 static rtx
29998 safe_vector_operand (rtx x, enum machine_mode mode)
30000 if (x == const0_rtx)
30001 x = CONST0_RTX (mode);
30002 return x;
30005 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30007 static rtx
30008 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30010 rtx pat;
30011 tree arg0 = CALL_EXPR_ARG (exp, 0);
30012 tree arg1 = CALL_EXPR_ARG (exp, 1);
30013 rtx op0 = expand_normal (arg0);
30014 rtx op1 = expand_normal (arg1);
30015 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30016 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30017 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30019 if (VECTOR_MODE_P (mode0))
30020 op0 = safe_vector_operand (op0, mode0);
30021 if (VECTOR_MODE_P (mode1))
30022 op1 = safe_vector_operand (op1, mode1);
30024 if (optimize || !target
30025 || GET_MODE (target) != tmode
30026 || !insn_data[icode].operand[0].predicate (target, tmode))
30027 target = gen_reg_rtx (tmode);
30029 if (GET_MODE (op1) == SImode && mode1 == TImode)
30031 rtx x = gen_reg_rtx (V4SImode);
30032 emit_insn (gen_sse2_loadd (x, op1));
30033 op1 = gen_lowpart (TImode, x);
30036 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30037 op0 = copy_to_mode_reg (mode0, op0);
30038 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30039 op1 = copy_to_mode_reg (mode1, op1);
30041 pat = GEN_FCN (icode) (target, op0, op1);
30042 if (! pat)
30043 return 0;
30045 emit_insn (pat);
30047 return target;
30050 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30052 static rtx
30053 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30054 enum ix86_builtin_func_type m_type,
30055 enum rtx_code sub_code)
30057 rtx pat;
30058 int i;
30059 int nargs;
30060 bool comparison_p = false;
30061 bool tf_p = false;
30062 bool last_arg_constant = false;
30063 int num_memory = 0;
30064 struct {
30065 rtx op;
30066 enum machine_mode mode;
30067 } args[4];
30069 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30071 switch (m_type)
30073 case MULTI_ARG_4_DF2_DI_I:
30074 case MULTI_ARG_4_DF2_DI_I1:
30075 case MULTI_ARG_4_SF2_SI_I:
30076 case MULTI_ARG_4_SF2_SI_I1:
30077 nargs = 4;
30078 last_arg_constant = true;
30079 break;
30081 case MULTI_ARG_3_SF:
30082 case MULTI_ARG_3_DF:
30083 case MULTI_ARG_3_SF2:
30084 case MULTI_ARG_3_DF2:
30085 case MULTI_ARG_3_DI:
30086 case MULTI_ARG_3_SI:
30087 case MULTI_ARG_3_SI_DI:
30088 case MULTI_ARG_3_HI:
30089 case MULTI_ARG_3_HI_SI:
30090 case MULTI_ARG_3_QI:
30091 case MULTI_ARG_3_DI2:
30092 case MULTI_ARG_3_SI2:
30093 case MULTI_ARG_3_HI2:
30094 case MULTI_ARG_3_QI2:
30095 nargs = 3;
30096 break;
30098 case MULTI_ARG_2_SF:
30099 case MULTI_ARG_2_DF:
30100 case MULTI_ARG_2_DI:
30101 case MULTI_ARG_2_SI:
30102 case MULTI_ARG_2_HI:
30103 case MULTI_ARG_2_QI:
30104 nargs = 2;
30105 break;
30107 case MULTI_ARG_2_DI_IMM:
30108 case MULTI_ARG_2_SI_IMM:
30109 case MULTI_ARG_2_HI_IMM:
30110 case MULTI_ARG_2_QI_IMM:
30111 nargs = 2;
30112 last_arg_constant = true;
30113 break;
30115 case MULTI_ARG_1_SF:
30116 case MULTI_ARG_1_DF:
30117 case MULTI_ARG_1_SF2:
30118 case MULTI_ARG_1_DF2:
30119 case MULTI_ARG_1_DI:
30120 case MULTI_ARG_1_SI:
30121 case MULTI_ARG_1_HI:
30122 case MULTI_ARG_1_QI:
30123 case MULTI_ARG_1_SI_DI:
30124 case MULTI_ARG_1_HI_DI:
30125 case MULTI_ARG_1_HI_SI:
30126 case MULTI_ARG_1_QI_DI:
30127 case MULTI_ARG_1_QI_SI:
30128 case MULTI_ARG_1_QI_HI:
30129 nargs = 1;
30130 break;
30132 case MULTI_ARG_2_DI_CMP:
30133 case MULTI_ARG_2_SI_CMP:
30134 case MULTI_ARG_2_HI_CMP:
30135 case MULTI_ARG_2_QI_CMP:
30136 nargs = 2;
30137 comparison_p = true;
30138 break;
30140 case MULTI_ARG_2_SF_TF:
30141 case MULTI_ARG_2_DF_TF:
30142 case MULTI_ARG_2_DI_TF:
30143 case MULTI_ARG_2_SI_TF:
30144 case MULTI_ARG_2_HI_TF:
30145 case MULTI_ARG_2_QI_TF:
30146 nargs = 2;
30147 tf_p = true;
30148 break;
30150 default:
30151 gcc_unreachable ();
30154 if (optimize || !target
30155 || GET_MODE (target) != tmode
30156 || !insn_data[icode].operand[0].predicate (target, tmode))
30157 target = gen_reg_rtx (tmode);
30159 gcc_assert (nargs <= 4);
30161 for (i = 0; i < nargs; i++)
30163 tree arg = CALL_EXPR_ARG (exp, i);
30164 rtx op = expand_normal (arg);
30165 int adjust = (comparison_p) ? 1 : 0;
30166 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30168 if (last_arg_constant && i == nargs - 1)
30170 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30172 enum insn_code new_icode = icode;
30173 switch (icode)
30175 case CODE_FOR_xop_vpermil2v2df3:
30176 case CODE_FOR_xop_vpermil2v4sf3:
30177 case CODE_FOR_xop_vpermil2v4df3:
30178 case CODE_FOR_xop_vpermil2v8sf3:
30179 error ("the last argument must be a 2-bit immediate");
30180 return gen_reg_rtx (tmode);
30181 case CODE_FOR_xop_rotlv2di3:
30182 new_icode = CODE_FOR_rotlv2di3;
30183 goto xop_rotl;
30184 case CODE_FOR_xop_rotlv4si3:
30185 new_icode = CODE_FOR_rotlv4si3;
30186 goto xop_rotl;
30187 case CODE_FOR_xop_rotlv8hi3:
30188 new_icode = CODE_FOR_rotlv8hi3;
30189 goto xop_rotl;
30190 case CODE_FOR_xop_rotlv16qi3:
30191 new_icode = CODE_FOR_rotlv16qi3;
30192 xop_rotl:
30193 if (CONST_INT_P (op))
30195 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30196 op = GEN_INT (INTVAL (op) & mask);
30197 gcc_checking_assert
30198 (insn_data[icode].operand[i + 1].predicate (op, mode));
30200 else
30202 gcc_checking_assert
30203 (nargs == 2
30204 && insn_data[new_icode].operand[0].mode == tmode
30205 && insn_data[new_icode].operand[1].mode == tmode
30206 && insn_data[new_icode].operand[2].mode == mode
30207 && insn_data[new_icode].operand[0].predicate
30208 == insn_data[icode].operand[0].predicate
30209 && insn_data[new_icode].operand[1].predicate
30210 == insn_data[icode].operand[1].predicate);
30211 icode = new_icode;
30212 goto non_constant;
30214 break;
30215 default:
30216 gcc_unreachable ();
30220 else
30222 non_constant:
30223 if (VECTOR_MODE_P (mode))
30224 op = safe_vector_operand (op, mode);
30226 /* If we aren't optimizing, only allow one memory operand to be
30227 generated. */
30228 if (memory_operand (op, mode))
30229 num_memory++;
30231 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30233 if (optimize
30234 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30235 || num_memory > 1)
30236 op = force_reg (mode, op);
30239 args[i].op = op;
30240 args[i].mode = mode;
30243 switch (nargs)
30245 case 1:
30246 pat = GEN_FCN (icode) (target, args[0].op);
30247 break;
30249 case 2:
30250 if (tf_p)
30251 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30252 GEN_INT ((int)sub_code));
30253 else if (! comparison_p)
30254 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30255 else
30257 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30258 args[0].op,
30259 args[1].op);
30261 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30263 break;
30265 case 3:
30266 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30267 break;
30269 case 4:
30270 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30271 break;
30273 default:
30274 gcc_unreachable ();
30277 if (! pat)
30278 return 0;
30280 emit_insn (pat);
30281 return target;
30284 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30285 insns with vec_merge. */
30287 static rtx
30288 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30289 rtx target)
30291 rtx pat;
30292 tree arg0 = CALL_EXPR_ARG (exp, 0);
30293 rtx op1, op0 = expand_normal (arg0);
30294 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30295 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30297 if (optimize || !target
30298 || GET_MODE (target) != tmode
30299 || !insn_data[icode].operand[0].predicate (target, tmode))
30300 target = gen_reg_rtx (tmode);
30302 if (VECTOR_MODE_P (mode0))
30303 op0 = safe_vector_operand (op0, mode0);
30305 if ((optimize && !register_operand (op0, mode0))
30306 || !insn_data[icode].operand[1].predicate (op0, mode0))
30307 op0 = copy_to_mode_reg (mode0, op0);
30309 op1 = op0;
30310 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30311 op1 = copy_to_mode_reg (mode0, op1);
30313 pat = GEN_FCN (icode) (target, op0, op1);
30314 if (! pat)
30315 return 0;
30316 emit_insn (pat);
30317 return target;
30320 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30322 static rtx
30323 ix86_expand_sse_compare (const struct builtin_description *d,
30324 tree exp, rtx target, bool swap)
30326 rtx pat;
30327 tree arg0 = CALL_EXPR_ARG (exp, 0);
30328 tree arg1 = CALL_EXPR_ARG (exp, 1);
30329 rtx op0 = expand_normal (arg0);
30330 rtx op1 = expand_normal (arg1);
30331 rtx op2;
30332 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30333 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30334 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30335 enum rtx_code comparison = d->comparison;
30337 if (VECTOR_MODE_P (mode0))
30338 op0 = safe_vector_operand (op0, mode0);
30339 if (VECTOR_MODE_P (mode1))
30340 op1 = safe_vector_operand (op1, mode1);
30342 /* Swap operands if we have a comparison that isn't available in
30343 hardware. */
30344 if (swap)
30346 rtx tmp = gen_reg_rtx (mode1);
30347 emit_move_insn (tmp, op1);
30348 op1 = op0;
30349 op0 = tmp;
30352 if (optimize || !target
30353 || GET_MODE (target) != tmode
30354 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30355 target = gen_reg_rtx (tmode);
30357 if ((optimize && !register_operand (op0, mode0))
30358 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30359 op0 = copy_to_mode_reg (mode0, op0);
30360 if ((optimize && !register_operand (op1, mode1))
30361 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30362 op1 = copy_to_mode_reg (mode1, op1);
30364 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30365 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30366 if (! pat)
30367 return 0;
30368 emit_insn (pat);
30369 return target;
30372 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30374 static rtx
30375 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30376 rtx target)
30378 rtx pat;
30379 tree arg0 = CALL_EXPR_ARG (exp, 0);
30380 tree arg1 = CALL_EXPR_ARG (exp, 1);
30381 rtx op0 = expand_normal (arg0);
30382 rtx op1 = expand_normal (arg1);
30383 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30384 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30385 enum rtx_code comparison = d->comparison;
30387 if (VECTOR_MODE_P (mode0))
30388 op0 = safe_vector_operand (op0, mode0);
30389 if (VECTOR_MODE_P (mode1))
30390 op1 = safe_vector_operand (op1, mode1);
30392 /* Swap operands if we have a comparison that isn't available in
30393 hardware. */
30394 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30396 rtx tmp = op1;
30397 op1 = op0;
30398 op0 = tmp;
30401 target = gen_reg_rtx (SImode);
30402 emit_move_insn (target, const0_rtx);
30403 target = gen_rtx_SUBREG (QImode, target, 0);
30405 if ((optimize && !register_operand (op0, mode0))
30406 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30407 op0 = copy_to_mode_reg (mode0, op0);
30408 if ((optimize && !register_operand (op1, mode1))
30409 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30410 op1 = copy_to_mode_reg (mode1, op1);
30412 pat = GEN_FCN (d->icode) (op0, op1);
30413 if (! pat)
30414 return 0;
30415 emit_insn (pat);
30416 emit_insn (gen_rtx_SET (VOIDmode,
30417 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30418 gen_rtx_fmt_ee (comparison, QImode,
30419 SET_DEST (pat),
30420 const0_rtx)));
30422 return SUBREG_REG (target);
30425 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30427 static rtx
30428 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30429 rtx target)
30431 rtx pat;
30432 tree arg0 = CALL_EXPR_ARG (exp, 0);
30433 rtx op1, op0 = expand_normal (arg0);
30434 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30435 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30437 if (optimize || target == 0
30438 || GET_MODE (target) != tmode
30439 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30440 target = gen_reg_rtx (tmode);
30442 if (VECTOR_MODE_P (mode0))
30443 op0 = safe_vector_operand (op0, mode0);
30445 if ((optimize && !register_operand (op0, mode0))
30446 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30447 op0 = copy_to_mode_reg (mode0, op0);
30449 op1 = GEN_INT (d->comparison);
30451 pat = GEN_FCN (d->icode) (target, op0, op1);
30452 if (! pat)
30453 return 0;
30454 emit_insn (pat);
30455 return target;
30458 static rtx
30459 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30460 tree exp, rtx target)
30462 rtx pat;
30463 tree arg0 = CALL_EXPR_ARG (exp, 0);
30464 tree arg1 = CALL_EXPR_ARG (exp, 1);
30465 rtx op0 = expand_normal (arg0);
30466 rtx op1 = expand_normal (arg1);
30467 rtx op2;
30468 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30469 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30470 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30472 if (optimize || target == 0
30473 || GET_MODE (target) != tmode
30474 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30475 target = gen_reg_rtx (tmode);
30477 op0 = safe_vector_operand (op0, mode0);
30478 op1 = safe_vector_operand (op1, mode1);
30480 if ((optimize && !register_operand (op0, mode0))
30481 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30482 op0 = copy_to_mode_reg (mode0, op0);
30483 if ((optimize && !register_operand (op1, mode1))
30484 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30485 op1 = copy_to_mode_reg (mode1, op1);
30487 op2 = GEN_INT (d->comparison);
30489 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30490 if (! pat)
30491 return 0;
30492 emit_insn (pat);
30493 return target;
30496 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30498 static rtx
30499 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30500 rtx target)
30502 rtx pat;
30503 tree arg0 = CALL_EXPR_ARG (exp, 0);
30504 tree arg1 = CALL_EXPR_ARG (exp, 1);
30505 rtx op0 = expand_normal (arg0);
30506 rtx op1 = expand_normal (arg1);
30507 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30508 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30509 enum rtx_code comparison = d->comparison;
30511 if (VECTOR_MODE_P (mode0))
30512 op0 = safe_vector_operand (op0, mode0);
30513 if (VECTOR_MODE_P (mode1))
30514 op1 = safe_vector_operand (op1, mode1);
30516 target = gen_reg_rtx (SImode);
30517 emit_move_insn (target, const0_rtx);
30518 target = gen_rtx_SUBREG (QImode, target, 0);
30520 if ((optimize && !register_operand (op0, mode0))
30521 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30522 op0 = copy_to_mode_reg (mode0, op0);
30523 if ((optimize && !register_operand (op1, mode1))
30524 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30525 op1 = copy_to_mode_reg (mode1, op1);
30527 pat = GEN_FCN (d->icode) (op0, op1);
30528 if (! pat)
30529 return 0;
30530 emit_insn (pat);
30531 emit_insn (gen_rtx_SET (VOIDmode,
30532 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30533 gen_rtx_fmt_ee (comparison, QImode,
30534 SET_DEST (pat),
30535 const0_rtx)));
30537 return SUBREG_REG (target);
30540 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30542 static rtx
30543 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30544 tree exp, rtx target)
30546 rtx pat;
30547 tree arg0 = CALL_EXPR_ARG (exp, 0);
30548 tree arg1 = CALL_EXPR_ARG (exp, 1);
30549 tree arg2 = CALL_EXPR_ARG (exp, 2);
30550 tree arg3 = CALL_EXPR_ARG (exp, 3);
30551 tree arg4 = CALL_EXPR_ARG (exp, 4);
30552 rtx scratch0, scratch1;
30553 rtx op0 = expand_normal (arg0);
30554 rtx op1 = expand_normal (arg1);
30555 rtx op2 = expand_normal (arg2);
30556 rtx op3 = expand_normal (arg3);
30557 rtx op4 = expand_normal (arg4);
30558 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30560 tmode0 = insn_data[d->icode].operand[0].mode;
30561 tmode1 = insn_data[d->icode].operand[1].mode;
30562 modev2 = insn_data[d->icode].operand[2].mode;
30563 modei3 = insn_data[d->icode].operand[3].mode;
30564 modev4 = insn_data[d->icode].operand[4].mode;
30565 modei5 = insn_data[d->icode].operand[5].mode;
30566 modeimm = insn_data[d->icode].operand[6].mode;
30568 if (VECTOR_MODE_P (modev2))
30569 op0 = safe_vector_operand (op0, modev2);
30570 if (VECTOR_MODE_P (modev4))
30571 op2 = safe_vector_operand (op2, modev4);
30573 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30574 op0 = copy_to_mode_reg (modev2, op0);
30575 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30576 op1 = copy_to_mode_reg (modei3, op1);
30577 if ((optimize && !register_operand (op2, modev4))
30578 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30579 op2 = copy_to_mode_reg (modev4, op2);
30580 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30581 op3 = copy_to_mode_reg (modei5, op3);
30583 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30585 error ("the fifth argument must be an 8-bit immediate");
30586 return const0_rtx;
30589 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30591 if (optimize || !target
30592 || GET_MODE (target) != tmode0
30593 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30594 target = gen_reg_rtx (tmode0);
30596 scratch1 = gen_reg_rtx (tmode1);
30598 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30600 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30602 if (optimize || !target
30603 || GET_MODE (target) != tmode1
30604 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30605 target = gen_reg_rtx (tmode1);
30607 scratch0 = gen_reg_rtx (tmode0);
30609 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30611 else
30613 gcc_assert (d->flag);
30615 scratch0 = gen_reg_rtx (tmode0);
30616 scratch1 = gen_reg_rtx (tmode1);
30618 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30621 if (! pat)
30622 return 0;
30624 emit_insn (pat);
30626 if (d->flag)
30628 target = gen_reg_rtx (SImode);
30629 emit_move_insn (target, const0_rtx);
30630 target = gen_rtx_SUBREG (QImode, target, 0);
30632 emit_insn
30633 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30634 gen_rtx_fmt_ee (EQ, QImode,
30635 gen_rtx_REG ((enum machine_mode) d->flag,
30636 FLAGS_REG),
30637 const0_rtx)));
30638 return SUBREG_REG (target);
30640 else
30641 return target;
30645 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30647 static rtx
30648 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30649 tree exp, rtx target)
30651 rtx pat;
30652 tree arg0 = CALL_EXPR_ARG (exp, 0);
30653 tree arg1 = CALL_EXPR_ARG (exp, 1);
30654 tree arg2 = CALL_EXPR_ARG (exp, 2);
30655 rtx scratch0, scratch1;
30656 rtx op0 = expand_normal (arg0);
30657 rtx op1 = expand_normal (arg1);
30658 rtx op2 = expand_normal (arg2);
30659 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30661 tmode0 = insn_data[d->icode].operand[0].mode;
30662 tmode1 = insn_data[d->icode].operand[1].mode;
30663 modev2 = insn_data[d->icode].operand[2].mode;
30664 modev3 = insn_data[d->icode].operand[3].mode;
30665 modeimm = insn_data[d->icode].operand[4].mode;
30667 if (VECTOR_MODE_P (modev2))
30668 op0 = safe_vector_operand (op0, modev2);
30669 if (VECTOR_MODE_P (modev3))
30670 op1 = safe_vector_operand (op1, modev3);
30672 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30673 op0 = copy_to_mode_reg (modev2, op0);
30674 if ((optimize && !register_operand (op1, modev3))
30675 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30676 op1 = copy_to_mode_reg (modev3, op1);
30678 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30680 error ("the third argument must be an 8-bit immediate");
30681 return const0_rtx;
30684 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30686 if (optimize || !target
30687 || GET_MODE (target) != tmode0
30688 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30689 target = gen_reg_rtx (tmode0);
30691 scratch1 = gen_reg_rtx (tmode1);
30693 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30695 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30697 if (optimize || !target
30698 || GET_MODE (target) != tmode1
30699 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30700 target = gen_reg_rtx (tmode1);
30702 scratch0 = gen_reg_rtx (tmode0);
30704 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30706 else
30708 gcc_assert (d->flag);
30710 scratch0 = gen_reg_rtx (tmode0);
30711 scratch1 = gen_reg_rtx (tmode1);
30713 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30716 if (! pat)
30717 return 0;
30719 emit_insn (pat);
30721 if (d->flag)
30723 target = gen_reg_rtx (SImode);
30724 emit_move_insn (target, const0_rtx);
30725 target = gen_rtx_SUBREG (QImode, target, 0);
30727 emit_insn
30728 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30729 gen_rtx_fmt_ee (EQ, QImode,
30730 gen_rtx_REG ((enum machine_mode) d->flag,
30731 FLAGS_REG),
30732 const0_rtx)));
30733 return SUBREG_REG (target);
30735 else
30736 return target;
30739 /* Subroutine of ix86_expand_builtin to take care of insns with a
30740    variable number of operands.  */
30742 static rtx
30743 ix86_expand_args_builtin (const struct builtin_description *d,
30744 tree exp, rtx target)
30746 rtx pat, real_target;
30747 unsigned int i, nargs;
30748 unsigned int nargs_constant = 0;
30749 int num_memory = 0;
30750 struct
30752 rtx op;
30753 enum machine_mode mode;
30754 } args[4];
30755 bool last_arg_count = false;
30756 enum insn_code icode = d->icode;
30757 const struct insn_data_d *insn_p = &insn_data[icode];
30758 enum machine_mode tmode = insn_p->operand[0].mode;
30759 enum machine_mode rmode = VOIDmode;
30760 bool swap = false;
30761 enum rtx_code comparison = d->comparison;
30763 switch ((enum ix86_builtin_func_type) d->flag)
30765 case V2DF_FTYPE_V2DF_ROUND:
30766 case V4DF_FTYPE_V4DF_ROUND:
30767 case V4SF_FTYPE_V4SF_ROUND:
30768 case V8SF_FTYPE_V8SF_ROUND:
30769 case V4SI_FTYPE_V4SF_ROUND:
30770 case V8SI_FTYPE_V8SF_ROUND:
30771 return ix86_expand_sse_round (d, exp, target);
30772 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30773 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30774 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30775 case INT_FTYPE_V8SF_V8SF_PTEST:
30776 case INT_FTYPE_V4DI_V4DI_PTEST:
30777 case INT_FTYPE_V4DF_V4DF_PTEST:
30778 case INT_FTYPE_V4SF_V4SF_PTEST:
30779 case INT_FTYPE_V2DI_V2DI_PTEST:
30780 case INT_FTYPE_V2DF_V2DF_PTEST:
30781 return ix86_expand_sse_ptest (d, exp, target);
30782 case FLOAT128_FTYPE_FLOAT128:
30783 case FLOAT_FTYPE_FLOAT:
30784 case INT_FTYPE_INT:
30785 case UINT64_FTYPE_INT:
30786 case UINT16_FTYPE_UINT16:
30787 case INT64_FTYPE_INT64:
30788 case INT64_FTYPE_V4SF:
30789 case INT64_FTYPE_V2DF:
30790 case INT_FTYPE_V16QI:
30791 case INT_FTYPE_V8QI:
30792 case INT_FTYPE_V8SF:
30793 case INT_FTYPE_V4DF:
30794 case INT_FTYPE_V4SF:
30795 case INT_FTYPE_V2DF:
30796 case INT_FTYPE_V32QI:
30797 case V16QI_FTYPE_V16QI:
30798 case V8SI_FTYPE_V8SF:
30799 case V8SI_FTYPE_V4SI:
30800 case V8HI_FTYPE_V8HI:
30801 case V8HI_FTYPE_V16QI:
30802 case V8QI_FTYPE_V8QI:
30803 case V8SF_FTYPE_V8SF:
30804 case V8SF_FTYPE_V8SI:
30805 case V8SF_FTYPE_V4SF:
30806 case V8SF_FTYPE_V8HI:
30807 case V4SI_FTYPE_V4SI:
30808 case V4SI_FTYPE_V16QI:
30809 case V4SI_FTYPE_V4SF:
30810 case V4SI_FTYPE_V8SI:
30811 case V4SI_FTYPE_V8HI:
30812 case V4SI_FTYPE_V4DF:
30813 case V4SI_FTYPE_V2DF:
30814 case V4HI_FTYPE_V4HI:
30815 case V4DF_FTYPE_V4DF:
30816 case V4DF_FTYPE_V4SI:
30817 case V4DF_FTYPE_V4SF:
30818 case V4DF_FTYPE_V2DF:
30819 case V4SF_FTYPE_V4SF:
30820 case V4SF_FTYPE_V4SI:
30821 case V4SF_FTYPE_V8SF:
30822 case V4SF_FTYPE_V4DF:
30823 case V4SF_FTYPE_V8HI:
30824 case V4SF_FTYPE_V2DF:
30825 case V2DI_FTYPE_V2DI:
30826 case V2DI_FTYPE_V16QI:
30827 case V2DI_FTYPE_V8HI:
30828 case V2DI_FTYPE_V4SI:
30829 case V2DF_FTYPE_V2DF:
30830 case V2DF_FTYPE_V4SI:
30831 case V2DF_FTYPE_V4DF:
30832 case V2DF_FTYPE_V4SF:
30833 case V2DF_FTYPE_V2SI:
30834 case V2SI_FTYPE_V2SI:
30835 case V2SI_FTYPE_V4SF:
30836 case V2SI_FTYPE_V2SF:
30837 case V2SI_FTYPE_V2DF:
30838 case V2SF_FTYPE_V2SF:
30839 case V2SF_FTYPE_V2SI:
30840 case V32QI_FTYPE_V32QI:
30841 case V32QI_FTYPE_V16QI:
30842 case V16HI_FTYPE_V16HI:
30843 case V16HI_FTYPE_V8HI:
30844 case V8SI_FTYPE_V8SI:
30845 case V16HI_FTYPE_V16QI:
30846 case V8SI_FTYPE_V16QI:
30847 case V4DI_FTYPE_V16QI:
30848 case V8SI_FTYPE_V8HI:
30849 case V4DI_FTYPE_V8HI:
30850 case V4DI_FTYPE_V4SI:
30851 case V4DI_FTYPE_V2DI:
30852 nargs = 1;
30853 break;
30854 case V4SF_FTYPE_V4SF_VEC_MERGE:
30855 case V2DF_FTYPE_V2DF_VEC_MERGE:
30856 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30857 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30858 case V16QI_FTYPE_V16QI_V16QI:
30859 case V16QI_FTYPE_V8HI_V8HI:
30860 case V8QI_FTYPE_V8QI_V8QI:
30861 case V8QI_FTYPE_V4HI_V4HI:
30862 case V8HI_FTYPE_V8HI_V8HI:
30863 case V8HI_FTYPE_V16QI_V16QI:
30864 case V8HI_FTYPE_V4SI_V4SI:
30865 case V8SF_FTYPE_V8SF_V8SF:
30866 case V8SF_FTYPE_V8SF_V8SI:
30867 case V4SI_FTYPE_V4SI_V4SI:
30868 case V4SI_FTYPE_V8HI_V8HI:
30869 case V4SI_FTYPE_V4SF_V4SF:
30870 case V4SI_FTYPE_V2DF_V2DF:
30871 case V4HI_FTYPE_V4HI_V4HI:
30872 case V4HI_FTYPE_V8QI_V8QI:
30873 case V4HI_FTYPE_V2SI_V2SI:
30874 case V4DF_FTYPE_V4DF_V4DF:
30875 case V4DF_FTYPE_V4DF_V4DI:
30876 case V4SF_FTYPE_V4SF_V4SF:
30877 case V4SF_FTYPE_V4SF_V4SI:
30878 case V4SF_FTYPE_V4SF_V2SI:
30879 case V4SF_FTYPE_V4SF_V2DF:
30880 case V4SF_FTYPE_V4SF_DI:
30881 case V4SF_FTYPE_V4SF_SI:
30882 case V2DI_FTYPE_V2DI_V2DI:
30883 case V2DI_FTYPE_V16QI_V16QI:
30884 case V2DI_FTYPE_V4SI_V4SI:
30885 case V2UDI_FTYPE_V4USI_V4USI:
30886 case V2DI_FTYPE_V2DI_V16QI:
30887 case V2DI_FTYPE_V2DF_V2DF:
30888 case V2SI_FTYPE_V2SI_V2SI:
30889 case V2SI_FTYPE_V4HI_V4HI:
30890 case V2SI_FTYPE_V2SF_V2SF:
30891 case V2DF_FTYPE_V2DF_V2DF:
30892 case V2DF_FTYPE_V2DF_V4SF:
30893 case V2DF_FTYPE_V2DF_V2DI:
30894 case V2DF_FTYPE_V2DF_DI:
30895 case V2DF_FTYPE_V2DF_SI:
30896 case V2SF_FTYPE_V2SF_V2SF:
30897 case V1DI_FTYPE_V1DI_V1DI:
30898 case V1DI_FTYPE_V8QI_V8QI:
30899 case V1DI_FTYPE_V2SI_V2SI:
30900 case V32QI_FTYPE_V16HI_V16HI:
30901 case V16HI_FTYPE_V8SI_V8SI:
30902 case V32QI_FTYPE_V32QI_V32QI:
30903 case V16HI_FTYPE_V32QI_V32QI:
30904 case V16HI_FTYPE_V16HI_V16HI:
30905 case V8SI_FTYPE_V4DF_V4DF:
30906 case V8SI_FTYPE_V8SI_V8SI:
30907 case V8SI_FTYPE_V16HI_V16HI:
30908 case V4DI_FTYPE_V4DI_V4DI:
30909 case V4DI_FTYPE_V8SI_V8SI:
30910 case V4UDI_FTYPE_V8USI_V8USI:
30911 if (comparison == UNKNOWN)
30912 return ix86_expand_binop_builtin (icode, exp, target);
30913 nargs = 2;
30914 break;
30915 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30916 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30917 gcc_assert (comparison != UNKNOWN);
30918 nargs = 2;
30919 swap = true;
30920 break;
30921 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30922 case V16HI_FTYPE_V16HI_SI_COUNT:
30923 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30924 case V8SI_FTYPE_V8SI_SI_COUNT:
30925 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30926 case V4DI_FTYPE_V4DI_INT_COUNT:
30927 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30928 case V8HI_FTYPE_V8HI_SI_COUNT:
30929 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30930 case V4SI_FTYPE_V4SI_SI_COUNT:
30931 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30932 case V4HI_FTYPE_V4HI_SI_COUNT:
30933 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30934 case V2DI_FTYPE_V2DI_SI_COUNT:
30935 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30936 case V2SI_FTYPE_V2SI_SI_COUNT:
30937 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30938 case V1DI_FTYPE_V1DI_SI_COUNT:
30939 nargs = 2;
30940 last_arg_count = true;
30941 break;
30942 case UINT64_FTYPE_UINT64_UINT64:
30943 case UINT_FTYPE_UINT_UINT:
30944 case UINT_FTYPE_UINT_USHORT:
30945 case UINT_FTYPE_UINT_UCHAR:
30946 case UINT16_FTYPE_UINT16_INT:
30947 case UINT8_FTYPE_UINT8_INT:
30948 nargs = 2;
30949 break;
30950 case V2DI_FTYPE_V2DI_INT_CONVERT:
30951 nargs = 2;
30952 rmode = V1TImode;
30953 nargs_constant = 1;
30954 break;
30955 case V4DI_FTYPE_V4DI_INT_CONVERT:
30956 nargs = 2;
30957 rmode = V2TImode;
30958 nargs_constant = 1;
30959 break;
30960 case V8HI_FTYPE_V8HI_INT:
30961 case V8HI_FTYPE_V8SF_INT:
30962 case V8HI_FTYPE_V4SF_INT:
30963 case V8SF_FTYPE_V8SF_INT:
30964 case V4SI_FTYPE_V4SI_INT:
30965 case V4SI_FTYPE_V8SI_INT:
30966 case V4HI_FTYPE_V4HI_INT:
30967 case V4DF_FTYPE_V4DF_INT:
30968 case V4SF_FTYPE_V4SF_INT:
30969 case V4SF_FTYPE_V8SF_INT:
30970 case V2DI_FTYPE_V2DI_INT:
30971 case V2DF_FTYPE_V2DF_INT:
30972 case V2DF_FTYPE_V4DF_INT:
30973 case V16HI_FTYPE_V16HI_INT:
30974 case V8SI_FTYPE_V8SI_INT:
30975 case V4DI_FTYPE_V4DI_INT:
30976 case V2DI_FTYPE_V4DI_INT:
30977 nargs = 2;
30978 nargs_constant = 1;
30979 break;
30980 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30981 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30982 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30983 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30984 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30985 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30986 nargs = 3;
30987 break;
30988 case V32QI_FTYPE_V32QI_V32QI_INT:
30989 case V16HI_FTYPE_V16HI_V16HI_INT:
30990 case V16QI_FTYPE_V16QI_V16QI_INT:
30991 case V4DI_FTYPE_V4DI_V4DI_INT:
30992 case V8HI_FTYPE_V8HI_V8HI_INT:
30993 case V8SI_FTYPE_V8SI_V8SI_INT:
30994 case V8SI_FTYPE_V8SI_V4SI_INT:
30995 case V8SF_FTYPE_V8SF_V8SF_INT:
30996 case V8SF_FTYPE_V8SF_V4SF_INT:
30997 case V4SI_FTYPE_V4SI_V4SI_INT:
30998 case V4DF_FTYPE_V4DF_V4DF_INT:
30999 case V4DF_FTYPE_V4DF_V2DF_INT:
31000 case V4SF_FTYPE_V4SF_V4SF_INT:
31001 case V2DI_FTYPE_V2DI_V2DI_INT:
31002 case V4DI_FTYPE_V4DI_V2DI_INT:
31003 case V2DF_FTYPE_V2DF_V2DF_INT:
31004 nargs = 3;
31005 nargs_constant = 1;
31006 break;
31007 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31008 nargs = 3;
31009 rmode = V4DImode;
31010 nargs_constant = 1;
31011 break;
31012 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31013 nargs = 3;
31014 rmode = V2DImode;
31015 nargs_constant = 1;
31016 break;
31017 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31018 nargs = 3;
31019 rmode = DImode;
31020 nargs_constant = 1;
31021 break;
31022 case V2DI_FTYPE_V2DI_UINT_UINT:
31023 nargs = 3;
31024 nargs_constant = 2;
31025 break;
31026 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31027 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31028 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31029 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31030 nargs = 4;
31031 nargs_constant = 1;
31032 break;
31033 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31034 nargs = 4;
31035 nargs_constant = 2;
31036 break;
31037 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31038 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31039 nargs = 4;
31040 break;
31041 default:
31042 gcc_unreachable ();
31045 gcc_assert (nargs <= ARRAY_SIZE (args));
31047 if (comparison != UNKNOWN)
31049 gcc_assert (nargs == 2);
31050 return ix86_expand_sse_compare (d, exp, target, swap);
31053 if (rmode == VOIDmode || rmode == tmode)
31055 if (optimize
31056 || target == 0
31057 || GET_MODE (target) != tmode
31058 || !insn_p->operand[0].predicate (target, tmode))
31059 target = gen_reg_rtx (tmode);
31060 real_target = target;
31062 else
31064 target = gen_reg_rtx (rmode);
31065 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31068 for (i = 0; i < nargs; i++)
31070 tree arg = CALL_EXPR_ARG (exp, i);
31071 rtx op = expand_normal (arg);
31072 enum machine_mode mode = insn_p->operand[i + 1].mode;
31073 bool match = insn_p->operand[i + 1].predicate (op, mode);
31075 if (last_arg_count && (i + 1) == nargs)
31077 	  /* SIMD shift insns take either an 8-bit immediate or a
31078 	     register as the count.  But the builtin functions take an int
31079 	     as the count.  If the count doesn't match, we put it in a register.  */
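	  /* Illustrative example (assumption, not from this file): a call
	     like _mm_slli_epi32 (v, n) with a non-constant N reaches this
	     point, and N is copied into a register so the shift pattern's
	     count operand still matches.  */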
31080 if (!match)
31082 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31083 if (!insn_p->operand[i + 1].predicate (op, mode))
31084 op = copy_to_reg (op);
31087 else if ((nargs - i) <= nargs_constant)
31089 if (!match)
31090 switch (icode)
31092 case CODE_FOR_avx2_inserti128:
31093 case CODE_FOR_avx2_extracti128:
31094 	      error ("the last argument must be a 1-bit immediate");
31095 return const0_rtx;
31097 case CODE_FOR_sse4_1_roundsd:
31098 case CODE_FOR_sse4_1_roundss:
31100 case CODE_FOR_sse4_1_roundpd:
31101 case CODE_FOR_sse4_1_roundps:
31102 case CODE_FOR_avx_roundpd256:
31103 case CODE_FOR_avx_roundps256:
31105 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31106 case CODE_FOR_sse4_1_roundps_sfix:
31107 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31108 case CODE_FOR_avx_roundps_sfix256:
31110 case CODE_FOR_sse4_1_blendps:
31111 case CODE_FOR_avx_blendpd256:
31112 case CODE_FOR_avx_vpermilv4df:
31113 error ("the last argument must be a 4-bit immediate");
31114 return const0_rtx;
31116 case CODE_FOR_sse4_1_blendpd:
31117 case CODE_FOR_avx_vpermilv2df:
31118 case CODE_FOR_xop_vpermil2v2df3:
31119 case CODE_FOR_xop_vpermil2v4sf3:
31120 case CODE_FOR_xop_vpermil2v4df3:
31121 case CODE_FOR_xop_vpermil2v8sf3:
31122 error ("the last argument must be a 2-bit immediate");
31123 return const0_rtx;
31125 case CODE_FOR_avx_vextractf128v4df:
31126 case CODE_FOR_avx_vextractf128v8sf:
31127 case CODE_FOR_avx_vextractf128v8si:
31128 case CODE_FOR_avx_vinsertf128v4df:
31129 case CODE_FOR_avx_vinsertf128v8sf:
31130 case CODE_FOR_avx_vinsertf128v8si:
31131 error ("the last argument must be a 1-bit immediate");
31132 return const0_rtx;
31134 case CODE_FOR_avx_vmcmpv2df3:
31135 case CODE_FOR_avx_vmcmpv4sf3:
31136 case CODE_FOR_avx_cmpv2df3:
31137 case CODE_FOR_avx_cmpv4sf3:
31138 case CODE_FOR_avx_cmpv4df3:
31139 case CODE_FOR_avx_cmpv8sf3:
31140 error ("the last argument must be a 5-bit immediate");
31141 return const0_rtx;
31143 default:
31144 switch (nargs_constant)
31146 case 2:
31147 if ((nargs - i) == nargs_constant)
31149 error ("the next to last argument must be an 8-bit immediate");
31150 break;
31152 case 1:
31153 error ("the last argument must be an 8-bit immediate");
31154 break;
31155 default:
31156 gcc_unreachable ();
31158 return const0_rtx;
31161 else
31163 if (VECTOR_MODE_P (mode))
31164 op = safe_vector_operand (op, mode);
31166 /* If we aren't optimizing, only allow one memory operand to
31167 be generated. */
31168 if (memory_operand (op, mode))
31169 num_memory++;
31171 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31173 if (optimize || !match || num_memory > 1)
31174 op = copy_to_mode_reg (mode, op);
31176 else
31178 op = copy_to_reg (op);
31179 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31183 args[i].op = op;
31184 args[i].mode = mode;
31187 switch (nargs)
31189 case 1:
31190 pat = GEN_FCN (icode) (real_target, args[0].op);
31191 break;
31192 case 2:
31193 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31194 break;
31195 case 3:
31196 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31197 args[2].op);
31198 break;
31199 case 4:
31200 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31201 args[2].op, args[3].op);
31202 break;
31203 default:
31204 gcc_unreachable ();
31207 if (! pat)
31208 return 0;
31210 emit_insn (pat);
31211 return target;
31214 /* Subroutine of ix86_expand_builtin to take care of special insns
31215    with a variable number of operands.  */
31217 static rtx
31218 ix86_expand_special_args_builtin (const struct builtin_description *d,
31219 tree exp, rtx target)
31221 tree arg;
31222 rtx pat, op;
31223 unsigned int i, nargs, arg_adjust, memory;
31224 struct
31226 rtx op;
31227 enum machine_mode mode;
31228 } args[3];
31229 enum insn_code icode = d->icode;
31230 bool last_arg_constant = false;
31231 const struct insn_data_d *insn_p = &insn_data[icode];
31232 enum machine_mode tmode = insn_p->operand[0].mode;
31233 enum { load, store } klass;
31235 switch ((enum ix86_builtin_func_type) d->flag)
31237 case VOID_FTYPE_VOID:
31238 emit_insn (GEN_FCN (icode) (target));
31239 return 0;
31240 case VOID_FTYPE_UINT64:
31241 case VOID_FTYPE_UNSIGNED:
31242 nargs = 0;
31243 klass = store;
31244 memory = 0;
31245 break;
31247 case INT_FTYPE_VOID:
31248 case UINT64_FTYPE_VOID:
31249 case UNSIGNED_FTYPE_VOID:
31250 nargs = 0;
31251 klass = load;
31252 memory = 0;
31253 break;
31254 case UINT64_FTYPE_PUNSIGNED:
31255 case V2DI_FTYPE_PV2DI:
31256 case V4DI_FTYPE_PV4DI:
31257 case V32QI_FTYPE_PCCHAR:
31258 case V16QI_FTYPE_PCCHAR:
31259 case V8SF_FTYPE_PCV4SF:
31260 case V8SF_FTYPE_PCFLOAT:
31261 case V4SF_FTYPE_PCFLOAT:
31262 case V4DF_FTYPE_PCV2DF:
31263 case V4DF_FTYPE_PCDOUBLE:
31264 case V2DF_FTYPE_PCDOUBLE:
31265 case VOID_FTYPE_PVOID:
31266 nargs = 1;
31267 klass = load;
31268 memory = 0;
31269 break;
31270 case VOID_FTYPE_PV2SF_V4SF:
31271 case VOID_FTYPE_PV4DI_V4DI:
31272 case VOID_FTYPE_PV2DI_V2DI:
31273 case VOID_FTYPE_PCHAR_V32QI:
31274 case VOID_FTYPE_PCHAR_V16QI:
31275 case VOID_FTYPE_PFLOAT_V8SF:
31276 case VOID_FTYPE_PFLOAT_V4SF:
31277 case VOID_FTYPE_PDOUBLE_V4DF:
31278 case VOID_FTYPE_PDOUBLE_V2DF:
31279 case VOID_FTYPE_PLONGLONG_LONGLONG:
31280 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31281 case VOID_FTYPE_PINT_INT:
31282 nargs = 1;
31283 klass = store;
31284 /* Reserve memory operand for target. */
31285 memory = ARRAY_SIZE (args);
31286 break;
31287 case V4SF_FTYPE_V4SF_PCV2SF:
31288 case V2DF_FTYPE_V2DF_PCDOUBLE:
31289 nargs = 2;
31290 klass = load;
31291 memory = 1;
31292 break;
31293 case V8SF_FTYPE_PCV8SF_V8SI:
31294 case V4DF_FTYPE_PCV4DF_V4DI:
31295 case V4SF_FTYPE_PCV4SF_V4SI:
31296 case V2DF_FTYPE_PCV2DF_V2DI:
31297 case V8SI_FTYPE_PCV8SI_V8SI:
31298 case V4DI_FTYPE_PCV4DI_V4DI:
31299 case V4SI_FTYPE_PCV4SI_V4SI:
31300 case V2DI_FTYPE_PCV2DI_V2DI:
31301 nargs = 2;
31302 klass = load;
31303 memory = 0;
31304 break;
31305 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31306 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31307 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31308 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31309 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31310 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31311 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31312 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31313 nargs = 2;
31314 klass = store;
31315 /* Reserve memory operand for target. */
31316 memory = ARRAY_SIZE (args);
31317 break;
31318 case VOID_FTYPE_UINT_UINT_UINT:
31319 case VOID_FTYPE_UINT64_UINT_UINT:
31320 case UCHAR_FTYPE_UINT_UINT_UINT:
31321 case UCHAR_FTYPE_UINT64_UINT_UINT:
31322 nargs = 3;
31323 klass = load;
31324 memory = ARRAY_SIZE (args);
31325 last_arg_constant = true;
31326 break;
31327 default:
31328 gcc_unreachable ();
31331 gcc_assert (nargs <= ARRAY_SIZE (args));
31333 if (klass == store)
31335 arg = CALL_EXPR_ARG (exp, 0);
31336 op = expand_normal (arg);
31337 gcc_assert (target == 0);
31338 if (memory)
31340 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31341 target = gen_rtx_MEM (tmode, op);
31343 else
31344 target = force_reg (tmode, op);
31345 arg_adjust = 1;
31347 else
31349 arg_adjust = 0;
31350 if (optimize
31351 || target == 0
31352 || !register_operand (target, tmode)
31353 || GET_MODE (target) != tmode)
31354 target = gen_reg_rtx (tmode);
31357 for (i = 0; i < nargs; i++)
31359 enum machine_mode mode = insn_p->operand[i + 1].mode;
31360 bool match;
31362 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31363 op = expand_normal (arg);
31364 match = insn_p->operand[i + 1].predicate (op, mode);
31366 if (last_arg_constant && (i + 1) == nargs)
31368 if (!match)
31370 if (icode == CODE_FOR_lwp_lwpvalsi3
31371 || icode == CODE_FOR_lwp_lwpinssi3
31372 || icode == CODE_FOR_lwp_lwpvaldi3
31373 || icode == CODE_FOR_lwp_lwpinsdi3)
31374 error ("the last argument must be a 32-bit immediate");
31375 else
31376 error ("the last argument must be an 8-bit immediate");
31377 return const0_rtx;
31380 else
31382 if (i == memory)
31384 /* This must be the memory operand. */
31385 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31386 op = gen_rtx_MEM (mode, op);
31387 gcc_assert (GET_MODE (op) == mode
31388 || GET_MODE (op) == VOIDmode);
31390 else
31392 	  /* This must be a register.  */
31393 if (VECTOR_MODE_P (mode))
31394 op = safe_vector_operand (op, mode);
31396 gcc_assert (GET_MODE (op) == mode
31397 || GET_MODE (op) == VOIDmode);
31398 op = copy_to_mode_reg (mode, op);
31402 args[i].op = op;
31403 args[i].mode = mode;
31406 switch (nargs)
31408 case 0:
31409 pat = GEN_FCN (icode) (target);
31410 break;
31411 case 1:
31412 pat = GEN_FCN (icode) (target, args[0].op);
31413 break;
31414 case 2:
31415 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31416 break;
31417 case 3:
31418 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31419 break;
31420 default:
31421 gcc_unreachable ();
31424 if (! pat)
31425 return 0;
31426 emit_insn (pat);
31427 return klass == store ? 0 : target;
31430 /* Return the integer constant in ARG. Constrain it to be in the range
31431 of the subparts of VEC_TYPE; issue an error if not. */
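/* For example: for a V4SF vector type the valid selectors are 0..3, so a
   call such as __builtin_ia32_vec_ext_v4sf (v, 4) is diagnosed below and 0
   is returned as a safe fallback (the call is illustrative, not from this
   file).  */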
31433 static int
31434 get_element_number (tree vec_type, tree arg)
31436 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31438 if (!host_integerp (arg, 1)
31439 || (elt = tree_low_cst (arg, 1), elt > max))
31441 error ("selector must be an integer constant in the range 0..%wi", max);
31442 return 0;
31445 return elt;
31448 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31449 ix86_expand_vector_init. We DO have language-level syntax for this, in
31450 the form of (type){ init-list }. Except that since we can't place emms
31451 instructions from inside the compiler, we can't allow the use of MMX
31452 registers unless the user explicitly asks for it. So we do *not* define
31453 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31454    we have builtins invoked by mmintrin.h that give us license to emit
31455 these sorts of instructions. */
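/* Illustrative sketch (assumption, not text from this file): mmintrin.h
   wraps these builtins roughly like

     extern __inline __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   so the user-visible intrinsic reaches ix86_expand_vec_init_builtin below
   without the compiler synthesizing MMX code on its own.  */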
31457 static rtx
31458 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31460 enum machine_mode tmode = TYPE_MODE (type);
31461 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31462 int i, n_elt = GET_MODE_NUNITS (tmode);
31463 rtvec v = rtvec_alloc (n_elt);
31465 gcc_assert (VECTOR_MODE_P (tmode));
31466 gcc_assert (call_expr_nargs (exp) == n_elt);
31468 for (i = 0; i < n_elt; ++i)
31470 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31471 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31474 if (!target || !register_operand (target, tmode))
31475 target = gen_reg_rtx (tmode);
31477 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31478 return target;
31481 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31482 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31483 had a language-level syntax for referencing vector elements. */
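/* Illustrative example (assumed from the intrinsic headers, not this file):
   _mm_cvtss_f32 is implemented roughly as
     __builtin_ia32_vec_ext_v4sf ((__v4sf) __A, 0)
   which is expanded by the function below.  */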
31485 static rtx
31486 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31488 enum machine_mode tmode, mode0;
31489 tree arg0, arg1;
31490 int elt;
31491 rtx op0;
31493 arg0 = CALL_EXPR_ARG (exp, 0);
31494 arg1 = CALL_EXPR_ARG (exp, 1);
31496 op0 = expand_normal (arg0);
31497 elt = get_element_number (TREE_TYPE (arg0), arg1);
31499 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31500 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31501 gcc_assert (VECTOR_MODE_P (mode0));
31503 op0 = force_reg (mode0, op0);
31505 if (optimize || !target || !register_operand (target, tmode))
31506 target = gen_reg_rtx (tmode);
31508 ix86_expand_vector_extract (true, target, op0, elt);
31510 return target;
31513 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31514 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31515 a language-level syntax for referencing vector elements. */
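/* Illustrative example (assumed from the intrinsic headers, not this file):
   _mm_insert_epi16 is implemented roughly as
     (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) __A, __D, __N)
   which is expanded by the function below.  */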
31517 static rtx
31518 ix86_expand_vec_set_builtin (tree exp)
31520 enum machine_mode tmode, mode1;
31521 tree arg0, arg1, arg2;
31522 int elt;
31523 rtx op0, op1, target;
31525 arg0 = CALL_EXPR_ARG (exp, 0);
31526 arg1 = CALL_EXPR_ARG (exp, 1);
31527 arg2 = CALL_EXPR_ARG (exp, 2);
31529 tmode = TYPE_MODE (TREE_TYPE (arg0));
31530 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31531 gcc_assert (VECTOR_MODE_P (tmode));
31533 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31534 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31535 elt = get_element_number (TREE_TYPE (arg0), arg2);
31537 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31538 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31540 op0 = force_reg (tmode, op0);
31541 op1 = force_reg (mode1, op1);
31543 /* OP0 is the source of these builtin functions and shouldn't be
31544 modified. Create a copy, use it and return it as target. */
31545 target = gen_reg_rtx (tmode);
31546 emit_move_insn (target, op0);
31547 ix86_expand_vector_set (true, target, op1, elt);
31549 return target;
31552 /* Expand an expression EXP that calls a built-in function,
31553 with result going to TARGET if that's convenient
31554 (and in mode MODE if that's convenient).
31555 SUBTARGET may be used as the target for computing one of EXP's operands.
31556 IGNORE is nonzero if the value is to be ignored. */
31558 static rtx
31559 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31560 enum machine_mode mode ATTRIBUTE_UNUSED,
31561 int ignore ATTRIBUTE_UNUSED)
31563 const struct builtin_description *d;
31564 size_t i;
31565 enum insn_code icode;
31566 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31567 tree arg0, arg1, arg2, arg3, arg4;
31568 rtx op0, op1, op2, op3, op4, pat, insn;
31569 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31570 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31572 /* For CPU builtins that can be folded, fold first and expand the fold. */
31573 switch (fcode)
31575 case IX86_BUILTIN_CPU_INIT:
31577 /* Make it call __cpu_indicator_init in libgcc. */
31578 tree call_expr, fndecl, type;
31579 type = build_function_type_list (integer_type_node, NULL_TREE);
31580 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31581 call_expr = build_call_expr (fndecl, 0);
31582 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31584 case IX86_BUILTIN_CPU_IS:
31585 case IX86_BUILTIN_CPU_SUPPORTS:
31587 tree arg0 = CALL_EXPR_ARG (exp, 0);
31588 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31589 gcc_assert (fold_expr != NULL_TREE);
31590 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
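      /* Illustrative note: a call such as __builtin_cpu_is ("intel") is
	 folded by fold_builtin_cpu into a test against the __cpu_model
	 data maintained by libgcc, so no target-specific insn is emitted
	 here.  */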
31594 /* Determine whether the builtin function is available under the current ISA.
31595 Originally the builtin was not created if it wasn't applicable to the
31596 current ISA based on the command line switches. With function specific
31597 options, we need to check in the context of the function making the call
31598 whether it is supported. */
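  /* For example (illustrative): a caller declared with
     __attribute__ ((target ("sse4.2"))) may use SSE4.2 builtins even when
     the file is not compiled with -msse4.2, while the same builtin used
     without that attribute or option falls into the error path below.  */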
31599 if (ix86_builtins_isa[fcode].isa
31600 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31602 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31603 NULL, (enum fpmath_unit) 0, false);
31605 if (!opts)
31606 error ("%qE needs unknown isa option", fndecl);
31607 else
31609 gcc_assert (opts != NULL);
31610 error ("%qE needs isa option %s", fndecl, opts);
31611 free (opts);
31613 return const0_rtx;
31616 switch (fcode)
31618 case IX86_BUILTIN_MASKMOVQ:
31619 case IX86_BUILTIN_MASKMOVDQU:
31620 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31621 ? CODE_FOR_mmx_maskmovq
31622 : CODE_FOR_sse2_maskmovdqu);
31623 /* Note the arg order is different from the operand order. */
31624 arg1 = CALL_EXPR_ARG (exp, 0);
31625 arg2 = CALL_EXPR_ARG (exp, 1);
31626 arg0 = CALL_EXPR_ARG (exp, 2);
31627 op0 = expand_normal (arg0);
31628 op1 = expand_normal (arg1);
31629 op2 = expand_normal (arg2);
31630 mode0 = insn_data[icode].operand[0].mode;
31631 mode1 = insn_data[icode].operand[1].mode;
31632 mode2 = insn_data[icode].operand[2].mode;
31634 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31635 op0 = gen_rtx_MEM (mode1, op0);
31637 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31638 op0 = copy_to_mode_reg (mode0, op0);
31639 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31640 op1 = copy_to_mode_reg (mode1, op1);
31641 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31642 op2 = copy_to_mode_reg (mode2, op2);
31643 pat = GEN_FCN (icode) (op0, op1, op2);
31644 if (! pat)
31645 return 0;
31646 emit_insn (pat);
31647 return 0;
31649 case IX86_BUILTIN_LDMXCSR:
31650 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31651 target = assign_386_stack_local (SImode, SLOT_TEMP);
31652 emit_move_insn (target, op0);
31653 emit_insn (gen_sse_ldmxcsr (target));
31654 return 0;
31656 case IX86_BUILTIN_STMXCSR:
31657 target = assign_386_stack_local (SImode, SLOT_TEMP);
31658 emit_insn (gen_sse_stmxcsr (target));
31659 return copy_to_mode_reg (SImode, target);
31661 case IX86_BUILTIN_CLFLUSH:
31662 arg0 = CALL_EXPR_ARG (exp, 0);
31663 op0 = expand_normal (arg0);
31664 icode = CODE_FOR_sse2_clflush;
31665 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31666 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31668 emit_insn (gen_sse2_clflush (op0));
31669 return 0;
31671 case IX86_BUILTIN_MONITOR:
31672 arg0 = CALL_EXPR_ARG (exp, 0);
31673 arg1 = CALL_EXPR_ARG (exp, 1);
31674 arg2 = CALL_EXPR_ARG (exp, 2);
31675 op0 = expand_normal (arg0);
31676 op1 = expand_normal (arg1);
31677 op2 = expand_normal (arg2);
31678 if (!REG_P (op0))
31679 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31680 if (!REG_P (op1))
31681 op1 = copy_to_mode_reg (SImode, op1);
31682 if (!REG_P (op2))
31683 op2 = copy_to_mode_reg (SImode, op2);
31684 emit_insn (ix86_gen_monitor (op0, op1, op2));
31685 return 0;
31687 case IX86_BUILTIN_MWAIT:
31688 arg0 = CALL_EXPR_ARG (exp, 0);
31689 arg1 = CALL_EXPR_ARG (exp, 1);
31690 op0 = expand_normal (arg0);
31691 op1 = expand_normal (arg1);
31692 if (!REG_P (op0))
31693 op0 = copy_to_mode_reg (SImode, op0);
31694 if (!REG_P (op1))
31695 op1 = copy_to_mode_reg (SImode, op1);
31696 emit_insn (gen_sse3_mwait (op0, op1));
31697 return 0;
31699 case IX86_BUILTIN_VEC_INIT_V2SI:
31700 case IX86_BUILTIN_VEC_INIT_V4HI:
31701 case IX86_BUILTIN_VEC_INIT_V8QI:
31702 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31704 case IX86_BUILTIN_VEC_EXT_V2DF:
31705 case IX86_BUILTIN_VEC_EXT_V2DI:
31706 case IX86_BUILTIN_VEC_EXT_V4SF:
31707 case IX86_BUILTIN_VEC_EXT_V4SI:
31708 case IX86_BUILTIN_VEC_EXT_V8HI:
31709 case IX86_BUILTIN_VEC_EXT_V2SI:
31710 case IX86_BUILTIN_VEC_EXT_V4HI:
31711 case IX86_BUILTIN_VEC_EXT_V16QI:
31712 return ix86_expand_vec_ext_builtin (exp, target);
31714 case IX86_BUILTIN_VEC_SET_V2DI:
31715 case IX86_BUILTIN_VEC_SET_V4SF:
31716 case IX86_BUILTIN_VEC_SET_V4SI:
31717 case IX86_BUILTIN_VEC_SET_V8HI:
31718 case IX86_BUILTIN_VEC_SET_V4HI:
31719 case IX86_BUILTIN_VEC_SET_V16QI:
31720 return ix86_expand_vec_set_builtin (exp);
31722 case IX86_BUILTIN_INFQ:
31723 case IX86_BUILTIN_HUGE_VALQ:
31725 REAL_VALUE_TYPE inf;
31726 rtx tmp;
31728 real_inf (&inf);
31729 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31731 tmp = validize_mem (force_const_mem (mode, tmp));
31733 if (target == 0)
31734 target = gen_reg_rtx (mode);
31736 emit_move_insn (target, tmp);
31737 return target;
31740 case IX86_BUILTIN_RDPMC:
31741 case IX86_BUILTIN_RDTSC:
31742 case IX86_BUILTIN_RDTSCP:
31744 op0 = gen_reg_rtx (DImode);
31745 op1 = gen_reg_rtx (DImode);
31747 if (fcode == IX86_BUILTIN_RDPMC)
31749 arg0 = CALL_EXPR_ARG (exp, 0);
31750 op2 = expand_normal (arg0);
31751 if (!register_operand (op2, SImode))
31752 op2 = copy_to_mode_reg (SImode, op2);
31754 insn = (TARGET_64BIT
31755 ? gen_rdpmc_rex64 (op0, op1, op2)
31756 : gen_rdpmc (op0, op2));
31757 emit_insn (insn);
31759 else if (fcode == IX86_BUILTIN_RDTSC)
31761 insn = (TARGET_64BIT
31762 ? gen_rdtsc_rex64 (op0, op1)
31763 : gen_rdtsc (op0));
31764 emit_insn (insn);
31766 else
31768 op2 = gen_reg_rtx (SImode);
31770 insn = (TARGET_64BIT
31771 ? gen_rdtscp_rex64 (op0, op1, op2)
31772 : gen_rdtscp (op0, op2));
31773 emit_insn (insn);
31775 arg0 = CALL_EXPR_ARG (exp, 0);
31776 op4 = expand_normal (arg0);
31777 if (!address_operand (op4, VOIDmode))
31779 op4 = convert_memory_address (Pmode, op4);
31780 op4 = copy_addr_to_reg (op4);
31782 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31785 if (target == 0)
31786 target = gen_reg_rtx (mode);
31788 if (TARGET_64BIT)
31790 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31791 op1, 1, OPTAB_DIRECT);
31792 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31793 op0, 1, OPTAB_DIRECT);
31796 emit_move_insn (target, op0);
31797 return target;
31799 case IX86_BUILTIN_FXSAVE:
31800 case IX86_BUILTIN_FXRSTOR:
31801 case IX86_BUILTIN_FXSAVE64:
31802 case IX86_BUILTIN_FXRSTOR64:
31803 switch (fcode)
31805 case IX86_BUILTIN_FXSAVE:
31806 icode = CODE_FOR_fxsave;
31807 break;
31808 case IX86_BUILTIN_FXRSTOR:
31809 icode = CODE_FOR_fxrstor;
31810 break;
31811 case IX86_BUILTIN_FXSAVE64:
31812 icode = CODE_FOR_fxsave64;
31813 break;
31814 case IX86_BUILTIN_FXRSTOR64:
31815 icode = CODE_FOR_fxrstor64;
31816 break;
31817 default:
31818 gcc_unreachable ();
31821 arg0 = CALL_EXPR_ARG (exp, 0);
31822 op0 = expand_normal (arg0);
31824 if (!address_operand (op0, VOIDmode))
31826 op0 = convert_memory_address (Pmode, op0);
31827 op0 = copy_addr_to_reg (op0);
31829 op0 = gen_rtx_MEM (BLKmode, op0);
31831 pat = GEN_FCN (icode) (op0);
31832 if (pat)
31833 emit_insn (pat);
31834 return 0;
31836 case IX86_BUILTIN_XSAVE:
31837 case IX86_BUILTIN_XRSTOR:
31838 case IX86_BUILTIN_XSAVE64:
31839 case IX86_BUILTIN_XRSTOR64:
31840 case IX86_BUILTIN_XSAVEOPT:
31841 case IX86_BUILTIN_XSAVEOPT64:
31842 arg0 = CALL_EXPR_ARG (exp, 0);
31843 arg1 = CALL_EXPR_ARG (exp, 1);
31844 op0 = expand_normal (arg0);
31845 op1 = expand_normal (arg1);
31847 if (!address_operand (op0, VOIDmode))
31849 op0 = convert_memory_address (Pmode, op0);
31850 op0 = copy_addr_to_reg (op0);
31852 op0 = gen_rtx_MEM (BLKmode, op0);
31854 op1 = force_reg (DImode, op1);
31856 if (TARGET_64BIT)
31858 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31859 NULL, 1, OPTAB_DIRECT);
31860 switch (fcode)
31862 case IX86_BUILTIN_XSAVE:
31863 icode = CODE_FOR_xsave_rex64;
31864 break;
31865 case IX86_BUILTIN_XRSTOR:
31866 icode = CODE_FOR_xrstor_rex64;
31867 break;
31868 case IX86_BUILTIN_XSAVE64:
31869 icode = CODE_FOR_xsave64;
31870 break;
31871 case IX86_BUILTIN_XRSTOR64:
31872 icode = CODE_FOR_xrstor64;
31873 break;
31874 case IX86_BUILTIN_XSAVEOPT:
31875 icode = CODE_FOR_xsaveopt_rex64;
31876 break;
31877 case IX86_BUILTIN_XSAVEOPT64:
31878 icode = CODE_FOR_xsaveopt64;
31879 break;
31880 default:
31881 gcc_unreachable ();
31884 op2 = gen_lowpart (SImode, op2);
31885 op1 = gen_lowpart (SImode, op1);
31886 pat = GEN_FCN (icode) (op0, op1, op2);
31888 else
31890 switch (fcode)
31892 case IX86_BUILTIN_XSAVE:
31893 icode = CODE_FOR_xsave;
31894 break;
31895 case IX86_BUILTIN_XRSTOR:
31896 icode = CODE_FOR_xrstor;
31897 break;
31898 case IX86_BUILTIN_XSAVEOPT:
31899 icode = CODE_FOR_xsaveopt;
31900 break;
31901 default:
31902 gcc_unreachable ();
31904 pat = GEN_FCN (icode) (op0, op1);
31907 if (pat)
31908 emit_insn (pat);
31909 return 0;
31911 case IX86_BUILTIN_LLWPCB:
31912 arg0 = CALL_EXPR_ARG (exp, 0);
31913 op0 = expand_normal (arg0);
31914 icode = CODE_FOR_lwp_llwpcb;
31915 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31916 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31917 emit_insn (gen_lwp_llwpcb (op0));
31918 return 0;
31920 case IX86_BUILTIN_SLWPCB:
31921 icode = CODE_FOR_lwp_slwpcb;
31922 if (!target
31923 || !insn_data[icode].operand[0].predicate (target, Pmode))
31924 target = gen_reg_rtx (Pmode);
31925 emit_insn (gen_lwp_slwpcb (target));
31926 return target;
31928 case IX86_BUILTIN_BEXTRI32:
31929 case IX86_BUILTIN_BEXTRI64:
31930 arg0 = CALL_EXPR_ARG (exp, 0);
31931 arg1 = CALL_EXPR_ARG (exp, 1);
31932 op0 = expand_normal (arg0);
31933 op1 = expand_normal (arg1);
31934 icode = (fcode == IX86_BUILTIN_BEXTRI32
31935 ? CODE_FOR_tbm_bextri_si
31936 : CODE_FOR_tbm_bextri_di);
31937 if (!CONST_INT_P (op1))
31939 	  error ("the last argument must be an immediate");
31940 return const0_rtx;
31942 else
31944 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31945 unsigned char lsb_index = INTVAL (op1) & 0xFF;
31946 op1 = GEN_INT (length);
31947 op2 = GEN_INT (lsb_index);
31948 pat = GEN_FCN (icode) (target, op0, op1, op2);
31949 if (pat)
31950 emit_insn (pat);
31951 return target;
31954 case IX86_BUILTIN_RDRAND16_STEP:
31955 icode = CODE_FOR_rdrandhi_1;
31956 mode0 = HImode;
31957 goto rdrand_step;
31959 case IX86_BUILTIN_RDRAND32_STEP:
31960 icode = CODE_FOR_rdrandsi_1;
31961 mode0 = SImode;
31962 goto rdrand_step;
31964 case IX86_BUILTIN_RDRAND64_STEP:
31965 icode = CODE_FOR_rdranddi_1;
31966 mode0 = DImode;
31968 rdrand_step:
31969 op0 = gen_reg_rtx (mode0);
31970 emit_insn (GEN_FCN (icode) (op0));
31972 arg0 = CALL_EXPR_ARG (exp, 0);
31973 op1 = expand_normal (arg0);
31974 if (!address_operand (op1, VOIDmode))
31976 op1 = convert_memory_address (Pmode, op1);
31977 op1 = copy_addr_to_reg (op1);
31979 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31981 op1 = gen_reg_rtx (SImode);
31982 emit_move_insn (op1, CONST1_RTX (SImode));
31984       /* Emit an SImode conditional move that picks the loaded value or the constant 1, depending on the carry flag.  */
31985 if (mode0 == HImode)
31987 op2 = gen_reg_rtx (SImode);
31988 emit_insn (gen_zero_extendhisi2 (op2, op0));
31990 else if (mode0 == SImode)
31991 op2 = op0;
31992 else
31993 op2 = gen_rtx_SUBREG (SImode, op0, 0);
31995 if (target == 0)
31996 target = gen_reg_rtx (SImode);
31998 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
31999 const0_rtx);
32000 emit_insn (gen_rtx_SET (VOIDmode, target,
32001 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32002 return target;
32004 case IX86_BUILTIN_RDSEED16_STEP:
32005 icode = CODE_FOR_rdseedhi_1;
32006 mode0 = HImode;
32007 goto rdseed_step;
32009 case IX86_BUILTIN_RDSEED32_STEP:
32010 icode = CODE_FOR_rdseedsi_1;
32011 mode0 = SImode;
32012 goto rdseed_step;
32014 case IX86_BUILTIN_RDSEED64_STEP:
32015 icode = CODE_FOR_rdseeddi_1;
32016 mode0 = DImode;
32018 rdseed_step:
32019 op0 = gen_reg_rtx (mode0);
32020 emit_insn (GEN_FCN (icode) (op0));
32022 arg0 = CALL_EXPR_ARG (exp, 0);
32023 op1 = expand_normal (arg0);
32024 if (!address_operand (op1, VOIDmode))
32026 op1 = convert_memory_address (Pmode, op1);
32027 op1 = copy_addr_to_reg (op1);
32029 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32031 op2 = gen_reg_rtx (QImode);
32033 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32034 const0_rtx);
32035 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32037 if (target == 0)
32038 target = gen_reg_rtx (SImode);
32040 emit_insn (gen_zero_extendqisi2 (target, op2));
32041 return target;
32043 case IX86_BUILTIN_ADDCARRYX32:
32044 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32045 mode0 = SImode;
32046 goto addcarryx;
32048 case IX86_BUILTIN_ADDCARRYX64:
32049 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32050 mode0 = DImode;
32052 addcarryx:
32053 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32054 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32055 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32056 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32058 op0 = gen_reg_rtx (QImode);
32060 /* Generate CF from input operand. */
32061 op1 = expand_normal (arg0);
32062 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32063 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
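      /* The addition of -1 above sets the carry flag exactly when the
	 incoming c_in byte is nonzero, which materializes the caller's
	 carry-in in CF for the add-with-carry emitted next.  */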
32065       /* Generate an ADCX (or plain ADC) instruction to compute X + Y + CF.  */
32066 op2 = expand_normal (arg1);
32067 op3 = expand_normal (arg2);
32069 if (!REG_P (op2))
32070 op2 = copy_to_mode_reg (mode0, op2);
32071 if (!REG_P (op3))
32072 op3 = copy_to_mode_reg (mode0, op3);
32074 op0 = gen_reg_rtx (mode0);
32076 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32077 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32078 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32080 /* Store the result. */
32081 op4 = expand_normal (arg3);
32082 if (!address_operand (op4, VOIDmode))
32084 op4 = convert_memory_address (Pmode, op4);
32085 op4 = copy_addr_to_reg (op4);
32087 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32089 /* Return current CF value. */
32090 if (target == 0)
32091 target = gen_reg_rtx (QImode);
32093 PUT_MODE (pat, QImode);
32094 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32095 return target;
32097 case IX86_BUILTIN_GATHERSIV2DF:
32098 icode = CODE_FOR_avx2_gathersiv2df;
32099 goto gather_gen;
32100 case IX86_BUILTIN_GATHERSIV4DF:
32101 icode = CODE_FOR_avx2_gathersiv4df;
32102 goto gather_gen;
32103 case IX86_BUILTIN_GATHERDIV2DF:
32104 icode = CODE_FOR_avx2_gatherdiv2df;
32105 goto gather_gen;
32106 case IX86_BUILTIN_GATHERDIV4DF:
32107 icode = CODE_FOR_avx2_gatherdiv4df;
32108 goto gather_gen;
32109 case IX86_BUILTIN_GATHERSIV4SF:
32110 icode = CODE_FOR_avx2_gathersiv4sf;
32111 goto gather_gen;
32112 case IX86_BUILTIN_GATHERSIV8SF:
32113 icode = CODE_FOR_avx2_gathersiv8sf;
32114 goto gather_gen;
32115 case IX86_BUILTIN_GATHERDIV4SF:
32116 icode = CODE_FOR_avx2_gatherdiv4sf;
32117 goto gather_gen;
32118 case IX86_BUILTIN_GATHERDIV8SF:
32119 icode = CODE_FOR_avx2_gatherdiv8sf;
32120 goto gather_gen;
32121 case IX86_BUILTIN_GATHERSIV2DI:
32122 icode = CODE_FOR_avx2_gathersiv2di;
32123 goto gather_gen;
32124 case IX86_BUILTIN_GATHERSIV4DI:
32125 icode = CODE_FOR_avx2_gathersiv4di;
32126 goto gather_gen;
32127 case IX86_BUILTIN_GATHERDIV2DI:
32128 icode = CODE_FOR_avx2_gatherdiv2di;
32129 goto gather_gen;
32130 case IX86_BUILTIN_GATHERDIV4DI:
32131 icode = CODE_FOR_avx2_gatherdiv4di;
32132 goto gather_gen;
32133 case IX86_BUILTIN_GATHERSIV4SI:
32134 icode = CODE_FOR_avx2_gathersiv4si;
32135 goto gather_gen;
32136 case IX86_BUILTIN_GATHERSIV8SI:
32137 icode = CODE_FOR_avx2_gathersiv8si;
32138 goto gather_gen;
32139 case IX86_BUILTIN_GATHERDIV4SI:
32140 icode = CODE_FOR_avx2_gatherdiv4si;
32141 goto gather_gen;
32142 case IX86_BUILTIN_GATHERDIV8SI:
32143 icode = CODE_FOR_avx2_gatherdiv8si;
32144 goto gather_gen;
32145 case IX86_BUILTIN_GATHERALTSIV4DF:
32146 icode = CODE_FOR_avx2_gathersiv4df;
32147 goto gather_gen;
32148 case IX86_BUILTIN_GATHERALTDIV8SF:
32149 icode = CODE_FOR_avx2_gatherdiv8sf;
32150 goto gather_gen;
32151 case IX86_BUILTIN_GATHERALTSIV4DI:
32152 icode = CODE_FOR_avx2_gathersiv4di;
32153 goto gather_gen;
32154 case IX86_BUILTIN_GATHERALTDIV8SI:
32155 icode = CODE_FOR_avx2_gatherdiv8si;
32156 goto gather_gen;
32158 gather_gen:
32159 arg0 = CALL_EXPR_ARG (exp, 0);
32160 arg1 = CALL_EXPR_ARG (exp, 1);
32161 arg2 = CALL_EXPR_ARG (exp, 2);
32162 arg3 = CALL_EXPR_ARG (exp, 3);
32163 arg4 = CALL_EXPR_ARG (exp, 4);
32164 op0 = expand_normal (arg0);
32165 op1 = expand_normal (arg1);
32166 op2 = expand_normal (arg2);
32167 op3 = expand_normal (arg3);
32168 op4 = expand_normal (arg4);
32169 /* Note the arg order is different from the operand order. */
32170 mode0 = insn_data[icode].operand[1].mode;
32171 mode2 = insn_data[icode].operand[3].mode;
32172 mode3 = insn_data[icode].operand[4].mode;
32173 mode4 = insn_data[icode].operand[5].mode;
32175 if (target == NULL_RTX
32176 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32177 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32178 else
32179 subtarget = target;
32181 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32182 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32184 rtx half = gen_reg_rtx (V4SImode);
32185 if (!nonimmediate_operand (op2, V8SImode))
32186 op2 = copy_to_mode_reg (V8SImode, op2);
32187 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32188 op2 = half;
32190 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32191 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32193 rtx (*gen) (rtx, rtx);
32194 rtx half = gen_reg_rtx (mode0);
32195 if (mode0 == V4SFmode)
32196 gen = gen_vec_extract_lo_v8sf;
32197 else
32198 gen = gen_vec_extract_lo_v8si;
32199 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32200 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32201 emit_insn (gen (half, op0));
32202 op0 = half;
32203 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32204 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32205 emit_insn (gen (half, op3));
32206 op3 = half;
32209       /* Force the memory operand to use only a base register here.  We
32210 	 don't want to do that for the memory operands of other builtin
32211 	 functions.  */
32212 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32214 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32215 op0 = copy_to_mode_reg (mode0, op0);
32216 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32217 op1 = copy_to_mode_reg (Pmode, op1);
32218 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32219 op2 = copy_to_mode_reg (mode2, op2);
32220 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32221 op3 = copy_to_mode_reg (mode3, op3);
32222 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32224 	  error ("the last argument must be a scale of 1, 2, 4 or 8");
32225 return const0_rtx;
32228 /* Optimize. If mask is known to have all high bits set,
32229 replace op0 with pc_rtx to signal that the instruction
32230 overwrites the whole destination and doesn't use its
32231 previous contents. */
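	 /* Illustrative example (assumption): a mask built with
	    _mm256_set1_epi32 (-1) folds to a VECTOR_CST whose elements all
	    have the sign bit set, so op0 is replaced by pc_rtx below.  */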
32232 if (optimize)
32234 if (TREE_CODE (arg3) == VECTOR_CST)
32236 unsigned int negative = 0;
32237 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32239 tree cst = VECTOR_CST_ELT (arg3, i);
32240 if (TREE_CODE (cst) == INTEGER_CST
32241 && tree_int_cst_sign_bit (cst))
32242 negative++;
32243 else if (TREE_CODE (cst) == REAL_CST
32244 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32245 negative++;
32247 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32248 op0 = pc_rtx;
32250 else if (TREE_CODE (arg3) == SSA_NAME)
32252 /* Recognize also when mask is like:
32253 __v2df src = _mm_setzero_pd ();
32254 __v2df mask = _mm_cmpeq_pd (src, src);
32256 __v8sf src = _mm256_setzero_ps ();
32257 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32258 as that is a cheaper way to load all ones into
32259 a register than having to load a constant from
32260 memory. */
32261 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32262 if (is_gimple_call (def_stmt))
32264 tree fndecl = gimple_call_fndecl (def_stmt);
32265 if (fndecl
32266 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32267 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32269 case IX86_BUILTIN_CMPPD:
32270 case IX86_BUILTIN_CMPPS:
32271 case IX86_BUILTIN_CMPPD256:
32272 case IX86_BUILTIN_CMPPS256:
32273 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32274 break;
32275 /* FALLTHRU */
32276 case IX86_BUILTIN_CMPEQPD:
32277 case IX86_BUILTIN_CMPEQPS:
32278 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32279 && initializer_zerop (gimple_call_arg (def_stmt,
32280 1)))
32281 op0 = pc_rtx;
32282 break;
32283 default:
32284 break;
32290 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32291 if (! pat)
32292 return const0_rtx;
32293 emit_insn (pat);
32295 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32296 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32298 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32299 ? V4SFmode : V4SImode;
32300 if (target == NULL_RTX)
32301 target = gen_reg_rtx (tmode);
32302 if (tmode == V4SFmode)
32303 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32304 else
32305 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32307 else
32308 target = subtarget;
32310 return target;
32312 case IX86_BUILTIN_XABORT:
32313 icode = CODE_FOR_xabort;
32314 arg0 = CALL_EXPR_ARG (exp, 0);
32315 op0 = expand_normal (arg0);
32316 mode0 = insn_data[icode].operand[0].mode;
32317 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32319 	  error ("the argument to xabort must be an 8-bit immediate");
32320 return const0_rtx;
32322 emit_insn (gen_xabort (op0));
32323 return 0;
32325 default:
32326 break;
32329 for (i = 0, d = bdesc_special_args;
32330 i < ARRAY_SIZE (bdesc_special_args);
32331 i++, d++)
32332 if (d->code == fcode)
32333 return ix86_expand_special_args_builtin (d, exp, target);
32335 for (i = 0, d = bdesc_args;
32336 i < ARRAY_SIZE (bdesc_args);
32337 i++, d++)
32338 if (d->code == fcode)
32339 switch (fcode)
32341 case IX86_BUILTIN_FABSQ:
32342 case IX86_BUILTIN_COPYSIGNQ:
32343 if (!TARGET_SSE)
32344 /* Emit a normal call if SSE isn't available. */
32345 return expand_call (exp, target, ignore);
32346 default:
32347 return ix86_expand_args_builtin (d, exp, target);
32350 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32351 if (d->code == fcode)
32352 return ix86_expand_sse_comi (d, exp, target);
32354 for (i = 0, d = bdesc_pcmpestr;
32355 i < ARRAY_SIZE (bdesc_pcmpestr);
32356 i++, d++)
32357 if (d->code == fcode)
32358 return ix86_expand_sse_pcmpestr (d, exp, target);
32360 for (i = 0, d = bdesc_pcmpistr;
32361 i < ARRAY_SIZE (bdesc_pcmpistr);
32362 i++, d++)
32363 if (d->code == fcode)
32364 return ix86_expand_sse_pcmpistr (d, exp, target);
32366 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32367 if (d->code == fcode)
32368 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32369 (enum ix86_builtin_func_type)
32370 d->flag, d->comparison);
32372 gcc_unreachable ();
32375 /* Returns a function decl for a vectorized version of the builtin function
32376 with builtin function code FN and the result vector type TYPE, or NULL_TREE
32377 if it is not available. */
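/* Illustrative example: when the vectorizer handles a loop of sqrt () calls
   on doubles with a V2DF vector type, the switch below maps BUILT_IN_SQRT
   with out_n == in_n == 2 to IX86_BUILTIN_SQRTPD (the sqrtpd builtin).  */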
32379 static tree
32380 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32381 tree type_in)
32383 enum machine_mode in_mode, out_mode;
32384 int in_n, out_n;
32385 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32387 if (TREE_CODE (type_out) != VECTOR_TYPE
32388 || TREE_CODE (type_in) != VECTOR_TYPE
32389 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32390 return NULL_TREE;
32392 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32393 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32394 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32395 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32397 switch (fn)
32399 case BUILT_IN_SQRT:
32400 if (out_mode == DFmode && in_mode == DFmode)
32402 if (out_n == 2 && in_n == 2)
32403 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32404 else if (out_n == 4 && in_n == 4)
32405 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32407 break;
32409 case BUILT_IN_SQRTF:
32410 if (out_mode == SFmode && in_mode == SFmode)
32412 if (out_n == 4 && in_n == 4)
32413 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32414 else if (out_n == 8 && in_n == 8)
32415 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32417 break;
32419 case BUILT_IN_IFLOOR:
32420 case BUILT_IN_LFLOOR:
32421 case BUILT_IN_LLFLOOR:
32422 /* The round insn does not trap on denormals. */
32423 if (flag_trapping_math || !TARGET_ROUND)
32424 break;
32426 if (out_mode == SImode && in_mode == DFmode)
32428 if (out_n == 4 && in_n == 2)
32429 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32430 else if (out_n == 8 && in_n == 4)
32431 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32433 break;
32435 case BUILT_IN_IFLOORF:
32436 case BUILT_IN_LFLOORF:
32437 case BUILT_IN_LLFLOORF:
32438 /* The round insn does not trap on denormals. */
32439 if (flag_trapping_math || !TARGET_ROUND)
32440 break;
32442 if (out_mode == SImode && in_mode == SFmode)
32444 if (out_n == 4 && in_n == 4)
32445 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32446 else if (out_n == 8 && in_n == 8)
32447 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32449 break;
32451 case BUILT_IN_ICEIL:
32452 case BUILT_IN_LCEIL:
32453 case BUILT_IN_LLCEIL:
32454 /* The round insn does not trap on denormals. */
32455 if (flag_trapping_math || !TARGET_ROUND)
32456 break;
32458 if (out_mode == SImode && in_mode == DFmode)
32460 if (out_n == 4 && in_n == 2)
32461 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32462 else if (out_n == 8 && in_n == 4)
32463 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32465 break;
32467 case BUILT_IN_ICEILF:
32468 case BUILT_IN_LCEILF:
32469 case BUILT_IN_LLCEILF:
32470 /* The round insn does not trap on denormals. */
32471 if (flag_trapping_math || !TARGET_ROUND)
32472 break;
32474 if (out_mode == SImode && in_mode == SFmode)
32476 if (out_n == 4 && in_n == 4)
32477 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32478 else if (out_n == 8 && in_n == 8)
32479 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32481 break;
32483 case BUILT_IN_IRINT:
32484 case BUILT_IN_LRINT:
32485 case BUILT_IN_LLRINT:
32486 if (out_mode == SImode && in_mode == DFmode)
32488 if (out_n == 4 && in_n == 2)
32489 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32490 else if (out_n == 8 && in_n == 4)
32491 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32493 break;
32495 case BUILT_IN_IRINTF:
32496 case BUILT_IN_LRINTF:
32497 case BUILT_IN_LLRINTF:
32498 if (out_mode == SImode && in_mode == SFmode)
32500 if (out_n == 4 && in_n == 4)
32501 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32502 else if (out_n == 8 && in_n == 8)
32503 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32505 break;
32507 case BUILT_IN_IROUND:
32508 case BUILT_IN_LROUND:
32509 case BUILT_IN_LLROUND:
32510 /* The round insn does not trap on denormals. */
32511 if (flag_trapping_math || !TARGET_ROUND)
32512 break;
32514 if (out_mode == SImode && in_mode == DFmode)
32516 if (out_n == 4 && in_n == 2)
32517 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32518 else if (out_n == 8 && in_n == 4)
32519 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32521 break;
32523 case BUILT_IN_IROUNDF:
32524 case BUILT_IN_LROUNDF:
32525 case BUILT_IN_LLROUNDF:
32526 /* The round insn does not trap on denormals. */
32527 if (flag_trapping_math || !TARGET_ROUND)
32528 break;
32530 if (out_mode == SImode && in_mode == SFmode)
32532 if (out_n == 4 && in_n == 4)
32533 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32534 else if (out_n == 8 && in_n == 8)
32535 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32537 break;
32539 case BUILT_IN_COPYSIGN:
32540 if (out_mode == DFmode && in_mode == DFmode)
32542 if (out_n == 2 && in_n == 2)
32543 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32544 else if (out_n == 4 && in_n == 4)
32545 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32547 break;
32549 case BUILT_IN_COPYSIGNF:
32550 if (out_mode == SFmode && in_mode == SFmode)
32552 if (out_n == 4 && in_n == 4)
32553 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32554 else if (out_n == 8 && in_n == 8)
32555 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32557 break;
32559 case BUILT_IN_FLOOR:
32560 /* The round insn does not trap on denormals. */
32561 if (flag_trapping_math || !TARGET_ROUND)
32562 break;
32564 if (out_mode == DFmode && in_mode == DFmode)
32566 if (out_n == 2 && in_n == 2)
32567 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32568 else if (out_n == 4 && in_n == 4)
32569 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32571 break;
32573 case BUILT_IN_FLOORF:
32574 /* The round insn does not trap on denormals. */
32575 if (flag_trapping_math || !TARGET_ROUND)
32576 break;
32578 if (out_mode == SFmode && in_mode == SFmode)
32580 if (out_n == 4 && in_n == 4)
32581 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32582 else if (out_n == 8 && in_n == 8)
32583 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32585 break;
32587 case BUILT_IN_CEIL:
32588 /* The round insn does not trap on denormals. */
32589 if (flag_trapping_math || !TARGET_ROUND)
32590 break;
32592 if (out_mode == DFmode && in_mode == DFmode)
32594 if (out_n == 2 && in_n == 2)
32595 return ix86_builtins[IX86_BUILTIN_CEILPD];
32596 else if (out_n == 4 && in_n == 4)
32597 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32599 break;
32601 case BUILT_IN_CEILF:
32602 /* The round insn does not trap on denormals. */
32603 if (flag_trapping_math || !TARGET_ROUND)
32604 break;
32606 if (out_mode == SFmode && in_mode == SFmode)
32608 if (out_n == 4 && in_n == 4)
32609 return ix86_builtins[IX86_BUILTIN_CEILPS];
32610 else if (out_n == 8 && in_n == 8)
32611 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32613 break;
32615 case BUILT_IN_TRUNC:
32616 /* The round insn does not trap on denormals. */
32617 if (flag_trapping_math || !TARGET_ROUND)
32618 break;
32620 if (out_mode == DFmode && in_mode == DFmode)
32622 if (out_n == 2 && in_n == 2)
32623 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32624 else if (out_n == 4 && in_n == 4)
32625 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32627 break;
32629 case BUILT_IN_TRUNCF:
32630 /* The round insn does not trap on denormals. */
32631 if (flag_trapping_math || !TARGET_ROUND)
32632 break;
32634 if (out_mode == SFmode && in_mode == SFmode)
32636 if (out_n == 4 && in_n == 4)
32637 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32638 else if (out_n == 8 && in_n == 8)
32639 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32641 break;
32643 case BUILT_IN_RINT:
32644 /* The round insn does not trap on denormals. */
32645 if (flag_trapping_math || !TARGET_ROUND)
32646 break;
32648 if (out_mode == DFmode && in_mode == DFmode)
32650 if (out_n == 2 && in_n == 2)
32651 return ix86_builtins[IX86_BUILTIN_RINTPD];
32652 else if (out_n == 4 && in_n == 4)
32653 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32655 break;
32657 case BUILT_IN_RINTF:
32658 /* The round insn does not trap on denormals. */
32659 if (flag_trapping_math || !TARGET_ROUND)
32660 break;
32662 if (out_mode == SFmode && in_mode == SFmode)
32664 if (out_n == 4 && in_n == 4)
32665 return ix86_builtins[IX86_BUILTIN_RINTPS];
32666 else if (out_n == 8 && in_n == 8)
32667 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32669 break;
32671 case BUILT_IN_ROUND:
32672 /* The round insn does not trap on denormals. */
32673 if (flag_trapping_math || !TARGET_ROUND)
32674 break;
32676 if (out_mode == DFmode && in_mode == DFmode)
32678 if (out_n == 2 && in_n == 2)
32679 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32680 else if (out_n == 4 && in_n == 4)
32681 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32683 break;
32685 case BUILT_IN_ROUNDF:
32686 /* The round insn does not trap on denormals. */
32687 if (flag_trapping_math || !TARGET_ROUND)
32688 break;
32690 if (out_mode == SFmode && in_mode == SFmode)
32692 if (out_n == 4 && in_n == 4)
32693 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32694 else if (out_n == 8 && in_n == 8)
32695 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32697 break;
32699 case BUILT_IN_FMA:
32700 if (out_mode == DFmode && in_mode == DFmode)
32702 if (out_n == 2 && in_n == 2)
32703 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32704 if (out_n == 4 && in_n == 4)
32705 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32707 break;
32709 case BUILT_IN_FMAF:
32710 if (out_mode == SFmode && in_mode == SFmode)
32712 if (out_n == 4 && in_n == 4)
32713 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32714 if (out_n == 8 && in_n == 8)
32715 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32717 break;
32719 default:
32720 break;
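/* For illustration (a traced example, not part of the table above): when
   the vectorizer asks for __builtin_lround over V2DF with a V4SI result,
   the BUILT_IN_LROUND case above selects
   IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX; anything not matched falls
   through to the vectorization library handler dispatched below.  */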
32723 /* Dispatch to a handler for a vectorization library. */
32724 if (ix86_veclib_handler)
32725 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32726 type_in);
32728 return NULL_TREE;
32731 /* Handler for an SVML-style interface to
32732 a library with vectorized intrinsics. */
32734 static tree
32735 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32737 char name[20];
32738 tree fntype, new_fndecl, args;
32739 unsigned arity;
32740 const char *bname;
32741 enum machine_mode el_mode, in_mode;
32742 int n, in_n;
32744 /* The SVML is suitable for unsafe math only. */
32745 if (!flag_unsafe_math_optimizations)
32746 return NULL_TREE;
32748 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32749 n = TYPE_VECTOR_SUBPARTS (type_out);
32750 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32751 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32752 if (el_mode != in_mode
32753 || n != in_n)
32754 return NULL_TREE;
32756 switch (fn)
32758 case BUILT_IN_EXP:
32759 case BUILT_IN_LOG:
32760 case BUILT_IN_LOG10:
32761 case BUILT_IN_POW:
32762 case BUILT_IN_TANH:
32763 case BUILT_IN_TAN:
32764 case BUILT_IN_ATAN:
32765 case BUILT_IN_ATAN2:
32766 case BUILT_IN_ATANH:
32767 case BUILT_IN_CBRT:
32768 case BUILT_IN_SINH:
32769 case BUILT_IN_SIN:
32770 case BUILT_IN_ASINH:
32771 case BUILT_IN_ASIN:
32772 case BUILT_IN_COSH:
32773 case BUILT_IN_COS:
32774 case BUILT_IN_ACOSH:
32775 case BUILT_IN_ACOS:
32776 if (el_mode != DFmode || n != 2)
32777 return NULL_TREE;
32778 break;
32780 case BUILT_IN_EXPF:
32781 case BUILT_IN_LOGF:
32782 case BUILT_IN_LOG10F:
32783 case BUILT_IN_POWF:
32784 case BUILT_IN_TANHF:
32785 case BUILT_IN_TANF:
32786 case BUILT_IN_ATANF:
32787 case BUILT_IN_ATAN2F:
32788 case BUILT_IN_ATANHF:
32789 case BUILT_IN_CBRTF:
32790 case BUILT_IN_SINHF:
32791 case BUILT_IN_SINF:
32792 case BUILT_IN_ASINHF:
32793 case BUILT_IN_ASINF:
32794 case BUILT_IN_COSHF:
32795 case BUILT_IN_COSF:
32796 case BUILT_IN_ACOSHF:
32797 case BUILT_IN_ACOSF:
32798 if (el_mode != SFmode || n != 4)
32799 return NULL_TREE;
32800 break;
32802 default:
32803 return NULL_TREE;
32806 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32808 if (fn == BUILT_IN_LOGF)
32809 strcpy (name, "vmlsLn4");
32810 else if (fn == BUILT_IN_LOG)
32811 strcpy (name, "vmldLn2");
32812 else if (n == 4)
32814 sprintf (name, "vmls%s", bname+10);
32815 name[strlen (name)-1] = '4';
32817 else
32818 sprintf (name, "vmld%s2", bname+10);
32820 /* Convert to uppercase. */
32821 name[4] &= ~0x20;
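/* For illustration of the mangling above (a sketch, not an exhaustive
   list): BUILT_IN_SINF has bname "__builtin_sinf", so bname+10 is "sinf";
   the sprintf builds "vmlssinf", the trailing character is overwritten
   with '4' and name[4] is upcased, giving "vmlsSin4".  The DFmode
   BUILT_IN_SIN case produces "vmldSin2" the same way.  */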
32823 arity = 0;
32824 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32825 args;
32826 args = TREE_CHAIN (args))
32827 arity++;
32829 if (arity == 1)
32830 fntype = build_function_type_list (type_out, type_in, NULL);
32831 else
32832 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32834 /* Build a function declaration for the vectorized function. */
32835 new_fndecl = build_decl (BUILTINS_LOCATION,
32836 FUNCTION_DECL, get_identifier (name), fntype);
32837 TREE_PUBLIC (new_fndecl) = 1;
32838 DECL_EXTERNAL (new_fndecl) = 1;
32839 DECL_IS_NOVOPS (new_fndecl) = 1;
32840 TREE_READONLY (new_fndecl) = 1;
32842 return new_fndecl;
32845 /* Handler for an ACML-style interface to
32846 a library with vectorized intrinsics. */
32848 static tree
32849 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32851 char name[20] = "__vr.._";
32852 tree fntype, new_fndecl, args;
32853 unsigned arity;
32854 const char *bname;
32855 enum machine_mode el_mode, in_mode;
32856 int n, in_n;
32858 /* The ACML is 64-bit only and suitable for unsafe math only, as
32859 it does not correctly support parts of IEEE with the required
32860 precision such as denormals. */
32861 if (!TARGET_64BIT
32862 || !flag_unsafe_math_optimizations)
32863 return NULL_TREE;
32865 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32866 n = TYPE_VECTOR_SUBPARTS (type_out);
32867 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32868 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32869 if (el_mode != in_mode
32870 || n != in_n)
32871 return NULL_TREE;
32873 switch (fn)
32875 case BUILT_IN_SIN:
32876 case BUILT_IN_COS:
32877 case BUILT_IN_EXP:
32878 case BUILT_IN_LOG:
32879 case BUILT_IN_LOG2:
32880 case BUILT_IN_LOG10:
32881 name[4] = 'd';
32882 name[5] = '2';
32883 if (el_mode != DFmode
32884 || n != 2)
32885 return NULL_TREE;
32886 break;
32888 case BUILT_IN_SINF:
32889 case BUILT_IN_COSF:
32890 case BUILT_IN_EXPF:
32891 case BUILT_IN_POWF:
32892 case BUILT_IN_LOGF:
32893 case BUILT_IN_LOG2F:
32894 case BUILT_IN_LOG10F:
32895 name[4] = 's';
32896 name[5] = '4';
32897 if (el_mode != SFmode
32898 || n != 4)
32899 return NULL_TREE;
32900 break;
32902 default:
32903 return NULL_TREE;
32906 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32907 sprintf (name + 7, "%s", bname+10);
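/* For illustration: BUILT_IN_SINF sets name to "__vrs4_" above and then
   appends bname+10 ("sinf"), giving "__vrs4_sinf"; BUILT_IN_SIN likewise
   yields "__vrd2_sin".  */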
32909 arity = 0;
32910 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32911 args;
32912 args = TREE_CHAIN (args))
32913 arity++;
32915 if (arity == 1)
32916 fntype = build_function_type_list (type_out, type_in, NULL);
32917 else
32918 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32920 /* Build a function declaration for the vectorized function. */
32921 new_fndecl = build_decl (BUILTINS_LOCATION,
32922 FUNCTION_DECL, get_identifier (name), fntype);
32923 TREE_PUBLIC (new_fndecl) = 1;
32924 DECL_EXTERNAL (new_fndecl) = 1;
32925 DECL_IS_NOVOPS (new_fndecl) = 1;
32926 TREE_READONLY (new_fndecl) = 1;
32928 return new_fndecl;
32931 /* Returns a decl of a function that implements gather load with
32932 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
32933 Return NULL_TREE if it is not available. */
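/* For example, a V2DF gather with a DImode index maps to
   IX86_BUILTIN_GATHERDIV2DF below, while an unsupported request such as
   a non-power-of-two SCALE (say 3) makes the hook return NULL_TREE, so
   the vectorizer cannot use a hardware gather.  */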
32935 static tree
32936 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32937 const_tree index_type, int scale)
32939 bool si;
32940 enum ix86_builtins code;
32942 if (! TARGET_AVX2)
32943 return NULL_TREE;
32945 if ((TREE_CODE (index_type) != INTEGER_TYPE
32946 && !POINTER_TYPE_P (index_type))
32947 || (TYPE_MODE (index_type) != SImode
32948 && TYPE_MODE (index_type) != DImode))
32949 return NULL_TREE;
32951 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32952 return NULL_TREE;
32954 /* v*gather* insn sign extends index to pointer mode. */
32955 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32956 && TYPE_UNSIGNED (index_type))
32957 return NULL_TREE;
32959 if (scale <= 0
32960 || scale > 8
32961 || (scale & (scale - 1)) != 0)
32962 return NULL_TREE;
32964 si = TYPE_MODE (index_type) == SImode;
32965 switch (TYPE_MODE (mem_vectype))
32967 case V2DFmode:
32968 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32969 break;
32970 case V4DFmode:
32971 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32972 break;
32973 case V2DImode:
32974 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32975 break;
32976 case V4DImode:
32977 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32978 break;
32979 case V4SFmode:
32980 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32981 break;
32982 case V8SFmode:
32983 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32984 break;
32985 case V4SImode:
32986 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32987 break;
32988 case V8SImode:
32989 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
32990 break;
32991 default:
32992 return NULL_TREE;
32995 return ix86_builtins[code];
32998 /* Returns a code for a target-specific builtin that implements
32999 reciprocal of the function, or NULL_TREE if not available. */
33001 static tree
33002 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33003 bool sqrt ATTRIBUTE_UNUSED)
33005 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33006 && flag_finite_math_only && !flag_trapping_math
33007 && flag_unsafe_math_optimizations))
33008 return NULL_TREE;
33010 if (md_fn)
33011 /* Machine dependent builtins. */
33012 switch (fn)
33014 /* Vectorized version of sqrt to rsqrt conversion. */
33015 case IX86_BUILTIN_SQRTPS_NR:
33016 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33018 case IX86_BUILTIN_SQRTPS_NR256:
33019 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33021 default:
33022 return NULL_TREE;
33024 else
33025 /* Normal builtins. */
33026 switch (fn)
33028 /* Sqrt to rsqrt conversion. */
33029 case BUILT_IN_SQRTF:
33030 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33032 default:
33033 return NULL_TREE;
33037 /* Helper for avx_vpermilps256_operand et al. This is also used by
33038 the expansion functions to turn the parallel back into a mask.
33039 The return value is 0 for no match and the imm8+1 for a match. */
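/* For example, a V4SFmode parallel selecting (1 0 3 2) gives
   mask = 1<<0 | 0<<2 | 3<<4 | 2<<6 = 0xb1, so the function below
   returns 0xb2; 0xb1 is the vpermilps immediate that reproduces that
   selection.  */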
33042 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33044 unsigned i, nelt = GET_MODE_NUNITS (mode);
33045 unsigned mask = 0;
33046 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33048 if (XVECLEN (par, 0) != (int) nelt)
33049 return 0;
33051 /* Validate that all of the elements are constants, and not totally
33052 out of range. Copy the data into an integral array to make the
33053 subsequent checks easier. */
33054 for (i = 0; i < nelt; ++i)
33056 rtx er = XVECEXP (par, 0, i);
33057 unsigned HOST_WIDE_INT ei;
33059 if (!CONST_INT_P (er))
33060 return 0;
33061 ei = INTVAL (er);
33062 if (ei >= nelt)
33063 return 0;
33064 ipar[i] = ei;
33067 switch (mode)
33069 case V4DFmode:
33070 /* In the 256-bit DFmode case, we can only move elements within
33071 a 128-bit lane. */
33072 for (i = 0; i < 2; ++i)
33074 if (ipar[i] >= 2)
33075 return 0;
33076 mask |= ipar[i] << i;
33078 for (i = 2; i < 4; ++i)
33080 if (ipar[i] < 2)
33081 return 0;
33082 mask |= (ipar[i] - 2) << i;
33084 break;
33086 case V8SFmode:
33087 /* In the 256-bit SFmode case, we have full freedom of movement
33088 within the low 128-bit lane, but the high 128-bit lane must
33089 mirror the exact same pattern. */
33090 for (i = 0; i < 4; ++i)
33091 if (ipar[i] + 4 != ipar[i + 4])
33092 return 0;
33093 nelt = 4;
33094 /* FALLTHRU */
33096 case V2DFmode:
33097 case V4SFmode:
33098 /* In the 128-bit case, we've full freedom in the placement of
33099 the elements from the source operand. */
33100 for (i = 0; i < nelt; ++i)
33101 mask |= ipar[i] << (i * (nelt / 2));
33102 break;
33104 default:
33105 gcc_unreachable ();
33108 /* Make sure success has a non-zero value by adding one. */
33109 return mask + 1;
33112 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33113 the expansion functions to turn the parallel back into a mask.
33114 The return value is 0 for no match and the imm8+1 for a match. */
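/* For example, the V8SFmode parallel (4 5 6 7 0 1 2 3), which swaps the
   two 128-bit lanes, gives e = 1 and 0 for the two halves, so mask ends
   up as 0x01 and the function below returns 0x02.  */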
33117 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33119 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33120 unsigned mask = 0;
33121 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33123 if (XVECLEN (par, 0) != (int) nelt)
33124 return 0;
33126 /* Validate that all of the elements are constants, and not totally
33127 out of range. Copy the data into an integral array to make the
33128 subsequent checks easier. */
33129 for (i = 0; i < nelt; ++i)
33131 rtx er = XVECEXP (par, 0, i);
33132 unsigned HOST_WIDE_INT ei;
33134 if (!CONST_INT_P (er))
33135 return 0;
33136 ei = INTVAL (er);
33137 if (ei >= 2 * nelt)
33138 return 0;
33139 ipar[i] = ei;
33142 /* Validate that each half of the permute selects a contiguous half. */
33143 for (i = 0; i < nelt2 - 1; ++i)
33144 if (ipar[i] + 1 != ipar[i + 1])
33145 return 0;
33146 for (i = nelt2; i < nelt - 1; ++i)
33147 if (ipar[i] + 1 != ipar[i + 1])
33148 return 0;
33150 /* Reconstruct the mask. */
33151 for (i = 0; i < 2; ++i)
33153 unsigned e = ipar[i * nelt2];
33154 if (e % nelt2)
33155 return 0;
33156 e /= nelt2;
33157 mask |= e << (i * 4);
33160 /* Make sure success has a non-zero value by adding one. */
33161 return mask + 1;
33164 /* Store OPERAND to the memory after reload is completed. This means
33165 that we can't easily use assign_stack_local. */
33167 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33169 rtx result;
33171 gcc_assert (reload_completed);
33172 if (ix86_using_red_zone ())
33174 result = gen_rtx_MEM (mode,
33175 gen_rtx_PLUS (Pmode,
33176 stack_pointer_rtx,
33177 GEN_INT (-RED_ZONE_SIZE)));
33178 emit_move_insn (result, operand);
33180 else if (TARGET_64BIT)
33182 switch (mode)
33184 case HImode:
33185 case SImode:
33186 operand = gen_lowpart (DImode, operand);
33187 /* FALLTHRU */
33188 case DImode:
33189 emit_insn (
33190 gen_rtx_SET (VOIDmode,
33191 gen_rtx_MEM (DImode,
33192 gen_rtx_PRE_DEC (DImode,
33193 stack_pointer_rtx)),
33194 operand));
33195 break;
33196 default:
33197 gcc_unreachable ();
33199 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33201 else
33203 switch (mode)
33205 case DImode:
33207 rtx operands[2];
33208 split_double_mode (mode, &operand, 1, operands, operands + 1);
33209 emit_insn (
33210 gen_rtx_SET (VOIDmode,
33211 gen_rtx_MEM (SImode,
33212 gen_rtx_PRE_DEC (Pmode,
33213 stack_pointer_rtx)),
33214 operands[1]));
33215 emit_insn (
33216 gen_rtx_SET (VOIDmode,
33217 gen_rtx_MEM (SImode,
33218 gen_rtx_PRE_DEC (Pmode,
33219 stack_pointer_rtx)),
33220 operands[0]));
33222 break;
33223 case HImode:
33224 /* Store HImodes as SImodes. */
33225 operand = gen_lowpart (SImode, operand);
33226 /* FALLTHRU */
33227 case SImode:
33228 emit_insn (
33229 gen_rtx_SET (VOIDmode,
33230 gen_rtx_MEM (GET_MODE (operand),
33231 gen_rtx_PRE_DEC (SImode,
33232 stack_pointer_rtx)),
33233 operand));
33234 break;
33235 default:
33236 gcc_unreachable ();
33238 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33240 return result;
33243 /* Free operand from the memory. */
33244 void
33245 ix86_free_from_memory (enum machine_mode mode)
33247 if (!ix86_using_red_zone ())
33249 int size;
33251 if (mode == DImode || TARGET_64BIT)
33252 size = 8;
33253 else
33254 size = 4;
33255 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33256 to pop or add instruction if registers are available. */
33257 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33258 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33259 GEN_INT (size))));
33263 /* Return a register priority for hard reg REGNO. */
33264 static int
33265 ix86_register_priority (int hard_regno)
33267 /* ebp and r13 as the base always want a displacement, r12 as the
33268 base always wants an index. So discourage their usage in an
33269 address. */
33270 if (hard_regno == R12_REG || hard_regno == R13_REG)
33271 return 0;
33272 if (hard_regno == BP_REG)
33273 return 1;
33274 /* New x86-64 int registers result in bigger code size. Discourage
33275 them. */
33276 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33277 return 2;
33278 /* New x86-64 SSE registers result in bigger code size. Discourage
33279 them. */
33280 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33281 return 2;
33282 /* Usage of AX register results in smaller code. Prefer it. */
33283 if (hard_regno == 0)
33284 return 4;
33285 return 3;
33288 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33290 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33291 QImode must go into class Q_REGS.
33292 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33293 movdf to do mem-to-mem moves through integer regs. */
33295 static reg_class_t
33296 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33298 enum machine_mode mode = GET_MODE (x);
33300 /* We're only allowed to return a subclass of CLASS. Many of the
33301 following checks fail for NO_REGS, so eliminate that early. */
33302 if (regclass == NO_REGS)
33303 return NO_REGS;
33305 /* All classes can load zeros. */
33306 if (x == CONST0_RTX (mode))
33307 return regclass;
33309 /* Force constants into memory if we are loading a (nonzero) constant into
33310 an MMX or SSE register. This is because there are no MMX/SSE instructions
33311 to load from a constant. */
33312 if (CONSTANT_P (x)
33313 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33314 return NO_REGS;
33316 /* Prefer SSE regs only, if we can use them for math. */
33317 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33318 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33320 /* Floating-point constants need more complex checks. */
33321 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33323 /* General regs can load everything. */
33324 if (reg_class_subset_p (regclass, GENERAL_REGS))
33325 return regclass;
33327 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33328 zero above. We only want to wind up preferring 80387 registers if
33329 we plan on doing computation with them. */
33330 if (TARGET_80387
33331 && standard_80387_constant_p (x) > 0)
33333 /* Limit class to non-sse. */
33334 if (regclass == FLOAT_SSE_REGS)
33335 return FLOAT_REGS;
33336 if (regclass == FP_TOP_SSE_REGS)
33337 return FP_TOP_REG;
33338 if (regclass == FP_SECOND_SSE_REGS)
33339 return FP_SECOND_REG;
33340 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33341 return regclass;
33344 return NO_REGS;
33347 /* Generally when we see PLUS here, it's the function invariant
33348 (plus soft-fp const_int). Which can only be computed into general
33349 regs. */
33350 if (GET_CODE (x) == PLUS)
33351 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33353 /* QImode constants are easy to load, but non-constant QImode data
33354 must go into Q_REGS. */
33355 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33357 if (reg_class_subset_p (regclass, Q_REGS))
33358 return regclass;
33359 if (reg_class_subset_p (Q_REGS, regclass))
33360 return Q_REGS;
33361 return NO_REGS;
33364 return regclass;
33367 /* Discourage putting floating-point values in SSE registers unless
33368 SSE math is being used, and likewise for the 387 registers. */
33369 static reg_class_t
33370 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33372 enum machine_mode mode = GET_MODE (x);
33374 /* Restrict the output reload class to the register bank that we are doing
33375 math on. If we would like not to return a subset of CLASS, reject this
33376 alternative: if reload cannot do this, it will still use its choice. */
33377 mode = GET_MODE (x);
33378 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33379 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33381 if (X87_FLOAT_MODE_P (mode))
33383 if (regclass == FP_TOP_SSE_REGS)
33384 return FP_TOP_REG;
33385 else if (regclass == FP_SECOND_SSE_REGS)
33386 return FP_SECOND_REG;
33387 else
33388 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33391 return regclass;
33394 static reg_class_t
33395 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33396 enum machine_mode mode, secondary_reload_info *sri)
33398 /* Double-word spills from general registers to non-offsettable memory
33399 references (zero-extended addresses) require special handling. */
33400 if (TARGET_64BIT
33401 && MEM_P (x)
33402 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33403 && rclass == GENERAL_REGS
33404 && !offsettable_memref_p (x))
33406 sri->icode = (in_p
33407 ? CODE_FOR_reload_noff_load
33408 : CODE_FOR_reload_noff_store);
33409 /* Add the cost of moving address to a temporary. */
33410 sri->extra_cost = 1;
33412 return NO_REGS;
33415 /* QImode spills from non-QI registers require an
33416 intermediate register on 32-bit targets. */
33417 if (!TARGET_64BIT
33418 && !in_p && mode == QImode
33419 && (rclass == GENERAL_REGS
33420 || rclass == LEGACY_REGS
33421 || rclass == NON_Q_REGS
33422 || rclass == SIREG
33423 || rclass == DIREG
33424 || rclass == INDEX_REGS))
33426 int regno;
33428 if (REG_P (x))
33429 regno = REGNO (x);
33430 else
33431 regno = -1;
33433 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33434 regno = true_regnum (x);
33436 /* Return Q_REGS if the operand is in memory. */
33437 if (regno == -1)
33438 return Q_REGS;
33441 /* This condition handles corner case where an expression involving
33442 pointers gets vectorized. We're trying to use the address of a
33443 stack slot as a vector initializer.
33445 (set (reg:V2DI 74 [ vect_cst_.2 ])
33446 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33448 Eventually frame gets turned into sp+offset like this:
33450 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33451 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33452 (const_int 392 [0x188]))))
33454 That later gets turned into:
33456 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33457 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33458 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33460 We'll have the following reload recorded:
33462 Reload 0: reload_in (DI) =
33463 (plus:DI (reg/f:DI 7 sp)
33464 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33465 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33466 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33467 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33468 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33469 reload_reg_rtx: (reg:V2DI 22 xmm1)
33471 Which isn't going to work since SSE instructions can't handle scalar
33472 additions. Returning GENERAL_REGS forces the addition into integer
33473 register and reload can handle subsequent reloads without problems. */
33475 if (in_p && GET_CODE (x) == PLUS
33476 && SSE_CLASS_P (rclass)
33477 && SCALAR_INT_MODE_P (mode))
33478 return GENERAL_REGS;
33480 return NO_REGS;
33483 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33485 static bool
33486 ix86_class_likely_spilled_p (reg_class_t rclass)
33488 switch (rclass)
33490 case AREG:
33491 case DREG:
33492 case CREG:
33493 case BREG:
33494 case AD_REGS:
33495 case SIREG:
33496 case DIREG:
33497 case SSE_FIRST_REG:
33498 case FP_TOP_REG:
33499 case FP_SECOND_REG:
33500 return true;
33502 default:
33503 break;
33506 return false;
33509 /* If we are copying between general and FP registers, we need a memory
33510 location. The same is true for SSE and MMX registers.
33512 To optimize register_move_cost performance, allow inline variant.
33514 The macro can't work reliably when one of the CLASSES is a class containing
33515 registers from multiple units (SSE, MMX, integer). We avoid this by never
33516 combining those units in single alternative in the machine description.
33517 Ensure that this constraint holds to avoid unexpected surprises.
33519 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33520 enforce these sanity checks. */
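/* For example, moving a DFmode value between SSE_REGS and GENERAL_REGS on
   a 32-bit target (where DFmode is wider than a word) is answered with
   "true" below, forcing the move through memory, whereas a move between
   two general-register subclasses is answered with "false".  */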
33522 static inline bool
33523 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33524 enum machine_mode mode, int strict)
33526 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33527 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33528 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33529 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33530 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33531 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33533 gcc_assert (!strict || lra_in_progress);
33534 return true;
33537 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33538 return true;
33540 /* ??? This is a lie. We do have moves between mmx/general, and for
33541 mmx/sse2. But by saying we need secondary memory we discourage the
33542 register allocator from using the mmx registers unless needed. */
33543 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33544 return true;
33546 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33548 /* SSE1 doesn't have any direct moves from other classes. */
33549 if (!TARGET_SSE2)
33550 return true;
33552 /* If the target says that inter-unit moves are more expensive
33553 than moving through memory, then don't generate them. */
33554 if (!TARGET_INTER_UNIT_MOVES)
33555 return true;
33557 /* Between SSE and general, we have moves no larger than word size. */
33558 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33559 return true;
33562 return false;
33565 bool
33566 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33567 enum machine_mode mode, int strict)
33569 return inline_secondary_memory_needed (class1, class2, mode, strict);
33572 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33574 On the 80386, this is the size of MODE in words,
33575 except in the FP regs, where a single reg is always enough. */
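/* For example, DImode in GENERAL_REGS needs (8 + 4 - 1)/4 = 2 registers
   on ia32 and 1 on x86-64, XFmode needs 3 (ia32) or 2 (x86-64), while a
   scalar mode in the FP or SSE classes needs just 1 and a complex mode
   needs 2.  */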
33577 static unsigned char
33578 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33580 if (MAYBE_INTEGER_CLASS_P (rclass))
33582 if (mode == XFmode)
33583 return (TARGET_64BIT ? 2 : 3);
33584 else if (mode == XCmode)
33585 return (TARGET_64BIT ? 4 : 6);
33586 else
33587 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33589 else
33591 if (COMPLEX_MODE_P (mode))
33592 return 2;
33593 else
33594 return 1;
33598 /* Return true if the registers in CLASS cannot represent the change from
33599 modes FROM to TO. */
33601 bool
33602 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33603 enum reg_class regclass)
33605 if (from == to)
33606 return false;
33608 /* x87 registers can't do subreg at all, as all values are reformatted
33609 to extended precision. */
33610 if (MAYBE_FLOAT_CLASS_P (regclass))
33611 return true;
33613 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33615 /* Vector registers do not support QI or HImode loads. If we don't
33616 disallow a change to these modes, reload will assume it's ok to
33617 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33618 the vec_dupv4hi pattern. */
33619 if (GET_MODE_SIZE (from) < 4)
33620 return true;
33622 /* Vector registers do not support subreg with nonzero offsets, which
33623 are otherwise valid for integer registers. Since we can't see
33624 whether we have a nonzero offset from here, prohibit all
33625 nonparadoxical subregs changing size. */
33626 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33627 return true;
33630 return false;
33633 /* Return the cost of moving data of mode M between a
33634 register and memory. A value of 2 is the default; this cost is
33635 relative to those in `REGISTER_MOVE_COST'.
33637 This function is used extensively by register_move_cost that is used to
33638 build tables at startup. Make it inline in this case.
33639 When IN is 2, return maximum of in and out move cost.
33641 If moving between registers and memory is more expensive than
33642 between two registers, you should define this macro to express the
33643 relative cost.
33645 Model also increased moving costs of QImode registers in non
33646 Q_REGS classes.
33648 static inline int
33649 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33650 int in)
33652 int cost;
33653 if (FLOAT_CLASS_P (regclass))
33655 int index;
33656 switch (mode)
33658 case SFmode:
33659 index = 0;
33660 break;
33661 case DFmode:
33662 index = 1;
33663 break;
33664 case XFmode:
33665 index = 2;
33666 break;
33667 default:
33668 return 100;
33670 if (in == 2)
33671 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33672 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33674 if (SSE_CLASS_P (regclass))
33676 int index;
33677 switch (GET_MODE_SIZE (mode))
33679 case 4:
33680 index = 0;
33681 break;
33682 case 8:
33683 index = 1;
33684 break;
33685 case 16:
33686 index = 2;
33687 break;
33688 default:
33689 return 100;
33691 if (in == 2)
33692 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33693 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33695 if (MMX_CLASS_P (regclass))
33697 int index;
33698 switch (GET_MODE_SIZE (mode))
33700 case 4:
33701 index = 0;
33702 break;
33703 case 8:
33704 index = 1;
33705 break;
33706 default:
33707 return 100;
33709 if (in == 2)
33710 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33711 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33713 switch (GET_MODE_SIZE (mode))
33715 case 1:
33716 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33718 if (!in)
33719 return ix86_cost->int_store[0];
33720 if (TARGET_PARTIAL_REG_DEPENDENCY
33721 && optimize_function_for_speed_p (cfun))
33722 cost = ix86_cost->movzbl_load;
33723 else
33724 cost = ix86_cost->int_load[0];
33725 if (in == 2)
33726 return MAX (cost, ix86_cost->int_store[0]);
33727 return cost;
33729 else
33731 if (in == 2)
33732 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33733 if (in)
33734 return ix86_cost->movzbl_load;
33735 else
33736 return ix86_cost->int_store[0] + 4;
33738 break;
33739 case 2:
33740 if (in == 2)
33741 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33742 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33743 default:
33744 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33745 if (mode == TFmode)
33746 mode = XFmode;
33747 if (in == 2)
33748 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33749 else if (in)
33750 cost = ix86_cost->int_load[2];
33751 else
33752 cost = ix86_cost->int_store[2];
33753 return (cost * (((int) GET_MODE_SIZE (mode)
33754 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33758 static int
33759 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33760 bool in)
33762 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33766 /* Return the cost of moving data from a register in class CLASS1 to
33767 one in class CLASS2.
33769 It is not required that the cost always equal 2 when FROM is the same as TO;
33770 on some machines it is expensive to move between registers if they are not
33771 general registers. */
33773 static int
33774 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33775 reg_class_t class2_i)
33777 enum reg_class class1 = (enum reg_class) class1_i;
33778 enum reg_class class2 = (enum reg_class) class2_i;
33780 /* In case we require secondary memory, compute cost of the store followed
33781 by load. In order to avoid bad register allocation choices, we need
33782 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33784 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33786 int cost = 1;
33788 cost += inline_memory_move_cost (mode, class1, 2);
33789 cost += inline_memory_move_cost (mode, class2, 2);
33791 /* When copying from a general purpose register we may emit multiple
33792 stores followed by a single load, causing a memory size mismatch stall.
33793 Count this as an arbitrarily high cost of 20. */
33794 if (targetm.class_max_nregs (class1, mode)
33795 > targetm.class_max_nregs (class2, mode))
33796 cost += 20;
33798 /* In the case of FP/MMX moves, the registers actually overlap, and we
33799 have to switch modes in order to treat them differently. */
33800 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33801 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33802 cost += 20;
33804 return cost;
33807 /* Moves between SSE/MMX and integer unit are expensive. */
33808 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33809 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33811 /* ??? By keeping returned value relatively high, we limit the number
33812 of moves between integer and MMX/SSE registers for all targets.
33813 Additionally, high value prevents problem with x86_modes_tieable_p(),
33814 where integer modes in MMX/SSE registers are not tieable
33815 because of missing QImode and HImode moves to, from or between
33816 MMX/SSE registers. */
33817 return MAX (8, ix86_cost->mmxsse_to_integer);
33819 if (MAYBE_FLOAT_CLASS_P (class1))
33820 return ix86_cost->fp_move;
33821 if (MAYBE_SSE_CLASS_P (class1))
33822 return ix86_cost->sse_move;
33823 if (MAYBE_MMX_CLASS_P (class1))
33824 return ix86_cost->mmx_move;
33825 return 2;
33828 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33829 MODE. */
33831 bool
33832 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33834 /* The flags registers can hold only CCmode values, and only they can hold them. */
33835 if (CC_REGNO_P (regno))
33836 return GET_MODE_CLASS (mode) == MODE_CC;
33837 if (GET_MODE_CLASS (mode) == MODE_CC
33838 || GET_MODE_CLASS (mode) == MODE_RANDOM
33839 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33840 return false;
33841 if (STACK_REGNO_P (regno))
33842 return VALID_FP_MODE_P (mode);
33843 if (SSE_REGNO_P (regno))
33845 /* We implement the move patterns for all vector modes into and
33846 out of SSE registers, even when no operation instructions
33847 are available. OImode move is available only when AVX is
33848 enabled. */
33849 return ((TARGET_AVX && mode == OImode)
33850 || VALID_AVX256_REG_MODE (mode)
33851 || VALID_SSE_REG_MODE (mode)
33852 || VALID_SSE2_REG_MODE (mode)
33853 || VALID_MMX_REG_MODE (mode)
33854 || VALID_MMX_REG_MODE_3DNOW (mode));
33856 if (MMX_REGNO_P (regno))
33858 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33859 so if the register is available at all, then we can move data of
33860 the given mode into or out of it. */
33861 return (VALID_MMX_REG_MODE (mode)
33862 || VALID_MMX_REG_MODE_3DNOW (mode));
33865 if (mode == QImode)
33867 /* Take care for QImode values - they can be in non-QI regs,
33868 but then they do cause partial register stalls. */
33869 if (TARGET_64BIT || QI_REGNO_P (regno))
33870 return true;
33871 if (!TARGET_PARTIAL_REG_STALL)
33872 return true;
33873 return !can_create_pseudo_p ();
33875 /* We handle both integer and floats in the general purpose registers. */
33876 else if (VALID_INT_MODE_P (mode))
33877 return true;
33878 else if (VALID_FP_MODE_P (mode))
33879 return true;
33880 else if (VALID_DFP_MODE_P (mode))
33881 return true;
33882 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33883 on to use that value in smaller contexts, this can easily force a
33884 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33885 supporting DImode, allow it. */
33886 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33887 return true;
33889 return false;
33892 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33893 tieable integer mode. */
33895 static bool
33896 ix86_tieable_integer_mode_p (enum machine_mode mode)
33898 switch (mode)
33900 case HImode:
33901 case SImode:
33902 return true;
33904 case QImode:
33905 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33907 case DImode:
33908 return TARGET_64BIT;
33910 default:
33911 return false;
33915 /* Return true if MODE1 is accessible in a register that can hold MODE2
33916 without copying. That is, all register classes that can hold MODE2
33917 can also hold MODE1. */
33919 bool
33920 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33922 if (mode1 == mode2)
33923 return true;
33925 if (ix86_tieable_integer_mode_p (mode1)
33926 && ix86_tieable_integer_mode_p (mode2))
33927 return true;
33929 /* MODE2 being XFmode implies fp stack or general regs, which means we
33930 can tie any smaller floating point modes to it. Note that we do not
33931 tie this with TFmode. */
33932 if (mode2 == XFmode)
33933 return mode1 == SFmode || mode1 == DFmode;
33935 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33936 that we can tie it with SFmode. */
33937 if (mode2 == DFmode)
33938 return mode1 == SFmode;
33940 /* If MODE2 is only appropriate for an SSE register, then tie with
33941 any other mode acceptable to SSE registers. */
33942 if (GET_MODE_SIZE (mode2) == 32
33943 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33944 return (GET_MODE_SIZE (mode1) == 32
33945 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33946 if (GET_MODE_SIZE (mode2) == 16
33947 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33948 return (GET_MODE_SIZE (mode1) == 16
33949 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33951 /* If MODE2 is appropriate for an MMX register, then tie
33952 with any other mode acceptable to MMX registers. */
33953 if (GET_MODE_SIZE (mode2) == 8
33954 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33955 return (GET_MODE_SIZE (mode1) == 8
33956 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33958 return false;
33961 /* Return the cost of moving between two registers of mode MODE. */
33963 static int
33964 ix86_set_reg_reg_cost (enum machine_mode mode)
33966 unsigned int units = UNITS_PER_WORD;
33968 switch (GET_MODE_CLASS (mode))
33970 default:
33971 break;
33973 case MODE_CC:
33974 units = GET_MODE_SIZE (CCmode);
33975 break;
33977 case MODE_FLOAT:
33978 if ((TARGET_SSE && mode == TFmode)
33979 || (TARGET_80387 && mode == XFmode)
33980 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33981 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33982 units = GET_MODE_SIZE (mode);
33983 break;
33985 case MODE_COMPLEX_FLOAT:
33986 if ((TARGET_SSE && mode == TCmode)
33987 || (TARGET_80387 && mode == XCmode)
33988 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33989 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
33990 units = GET_MODE_SIZE (mode);
33991 break;
33993 case MODE_VECTOR_INT:
33994 case MODE_VECTOR_FLOAT:
33995 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33996 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33997 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33998 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
33999 units = GET_MODE_SIZE (mode);
34002 /* Return the cost of moving between two registers of mode MODE,
34003 assuming that the move will be in pieces of at most UNITS bytes. */
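/* E.g. on ia32, UNITS stays at UNITS_PER_WORD (4) for a DImode move in
   general registers, giving COSTS_N_INSNS (2), while an SFmode or DFmode
   move whose mode is natively supported gets UNITS equal to the mode size
   and therefore COSTS_N_INSNS (1).  */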
34004 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
34007 /* Compute a (partial) cost for rtx X. Return true if the complete
34008 cost has been computed, and false if subexpressions should be
34009 scanned. In either case, *TOTAL contains the cost result. */
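/* For instance, a simple register-to-register SET such as
   (set (reg:SI 0) (reg:SI 1)) is costed through ix86_set_reg_reg_cost
   below and comes out as COSTS_N_INSNS (1).  */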
34011 static bool
34012 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34013 bool speed)
34015 enum rtx_code code = (enum rtx_code) code_i;
34016 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34017 enum machine_mode mode = GET_MODE (x);
34018 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34020 switch (code)
34022 case SET:
34023 if (register_operand (SET_DEST (x), VOIDmode)
34024 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34026 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34027 return true;
34029 return false;
34031 case CONST_INT:
34032 case CONST:
34033 case LABEL_REF:
34034 case SYMBOL_REF:
34035 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34036 *total = 3;
34037 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34038 *total = 2;
34039 else if (flag_pic && SYMBOLIC_CONST (x)
34040 && (!TARGET_64BIT
34041 || (GET_CODE (x) != LABEL_REF
34042 && (GET_CODE (x) != SYMBOL_REF
34043 || !SYMBOL_REF_LOCAL_P (x)))))
34044 *total = 1;
34045 else
34046 *total = 0;
34047 return true;
34049 case CONST_DOUBLE:
34050 if (mode == VOIDmode)
34052 *total = 0;
34053 return true;
34055 switch (standard_80387_constant_p (x))
34057 case 1: /* 0.0 */
34058 *total = 1;
34059 return true;
34060 default: /* Other constants */
34061 *total = 2;
34062 return true;
34063 case 0:
34064 case -1:
34065 break;
34067 if (SSE_FLOAT_MODE_P (mode))
34069 case CONST_VECTOR:
34070 switch (standard_sse_constant_p (x))
34072 case 0:
34073 break;
34074 case 1: /* 0: xor eliminates false dependency */
34075 *total = 0;
34076 return true;
34077 default: /* -1: cmp contains false dependency */
34078 *total = 1;
34079 return true;
34082 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34083 it'll probably end up. Add a penalty for size. */
34084 *total = (COSTS_N_INSNS (1)
34085 + (flag_pic != 0 && !TARGET_64BIT)
34086 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34087 return true;
34089 case ZERO_EXTEND:
34090 /* Zero extension is often completely free on x86_64, so make
34091 it as cheap as possible. */
34092 if (TARGET_64BIT && mode == DImode
34093 && GET_MODE (XEXP (x, 0)) == SImode)
34094 *total = 1;
34095 else if (TARGET_ZERO_EXTEND_WITH_AND)
34096 *total = cost->add;
34097 else
34098 *total = cost->movzx;
34099 return false;
34101 case SIGN_EXTEND:
34102 *total = cost->movsx;
34103 return false;
34105 case ASHIFT:
34106 if (SCALAR_INT_MODE_P (mode)
34107 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34108 && CONST_INT_P (XEXP (x, 1)))
34110 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34111 if (value == 1)
34113 *total = cost->add;
34114 return false;
34116 if ((value == 2 || value == 3)
34117 && cost->lea <= cost->shift_const)
34119 *total = cost->lea;
34120 return false;
34123 /* FALLTHRU */
34125 case ROTATE:
34126 case ASHIFTRT:
34127 case LSHIFTRT:
34128 case ROTATERT:
34129 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34131 /* ??? Should be SSE vector operation cost. */
34132 /* At least for published AMD latencies, this really is the same
34133 as the latency for a simple fpu operation like fabs. */
34134 /* V*QImode is emulated with 1-11 insns. */
34135 if (mode == V16QImode || mode == V32QImode)
34137 int count = 11;
34138 if (TARGET_XOP && mode == V16QImode)
34140 /* For XOP we use vpshab, which requires a broadcast of the
34141 value to the variable shift insn. For constants this
34142 means a V16Q const in mem; even when we can perform the
34143 shift with one insn set the cost to prefer paddb. */
34144 if (CONSTANT_P (XEXP (x, 1)))
34146 *total = (cost->fabs
34147 + rtx_cost (XEXP (x, 0), code, 0, speed)
34148 + (speed ? 2 : COSTS_N_BYTES (16)));
34149 return true;
34151 count = 3;
34153 else if (TARGET_SSSE3)
34154 count = 7;
34155 *total = cost->fabs * count;
34157 else
34158 *total = cost->fabs;
34160 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34162 if (CONST_INT_P (XEXP (x, 1)))
34164 if (INTVAL (XEXP (x, 1)) > 32)
34165 *total = cost->shift_const + COSTS_N_INSNS (2);
34166 else
34167 *total = cost->shift_const * 2;
34169 else
34171 if (GET_CODE (XEXP (x, 1)) == AND)
34172 *total = cost->shift_var * 2;
34173 else
34174 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34177 else
34179 if (CONST_INT_P (XEXP (x, 1)))
34180 *total = cost->shift_const;
34181 else
34182 *total = cost->shift_var;
34184 return false;
34186 case FMA:
34188 rtx sub;
34190 gcc_assert (FLOAT_MODE_P (mode));
34191 gcc_assert (TARGET_FMA || TARGET_FMA4);
34193 /* ??? SSE scalar/vector cost should be used here. */
34194 /* ??? Bald assumption that fma has the same cost as fmul. */
34195 *total = cost->fmul;
34196 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34198 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34199 sub = XEXP (x, 0);
34200 if (GET_CODE (sub) == NEG)
34201 sub = XEXP (sub, 0);
34202 *total += rtx_cost (sub, FMA, 0, speed);
34204 sub = XEXP (x, 2);
34205 if (GET_CODE (sub) == NEG)
34206 sub = XEXP (sub, 0);
34207 *total += rtx_cost (sub, FMA, 2, speed);
34208 return true;
34211 case MULT:
34212 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34214 /* ??? SSE scalar cost should be used here. */
34215 *total = cost->fmul;
34216 return false;
34218 else if (X87_FLOAT_MODE_P (mode))
34220 *total = cost->fmul;
34221 return false;
34223 else if (FLOAT_MODE_P (mode))
34225 /* ??? SSE vector cost should be used here. */
34226 *total = cost->fmul;
34227 return false;
34229 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34231 /* V*QImode is emulated with 7-13 insns. */
34232 if (mode == V16QImode || mode == V32QImode)
34234 int extra = 11;
34235 if (TARGET_XOP && mode == V16QImode)
34236 extra = 5;
34237 else if (TARGET_SSSE3)
34238 extra = 6;
34239 *total = cost->fmul * 2 + cost->fabs * extra;
34241 /* V*DImode is emulated with 5-8 insns. */
34242 else if (mode == V2DImode || mode == V4DImode)
34244 if (TARGET_XOP && mode == V2DImode)
34245 *total = cost->fmul * 2 + cost->fabs * 3;
34246 else
34247 *total = cost->fmul * 3 + cost->fabs * 5;
34249 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34250 insns, including two PMULUDQ. */
34251 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34252 *total = cost->fmul * 2 + cost->fabs * 5;
34253 else
34254 *total = cost->fmul;
34255 return false;
34257 else
34259 rtx op0 = XEXP (x, 0);
34260 rtx op1 = XEXP (x, 1);
34261 int nbits;
34262 if (CONST_INT_P (XEXP (x, 1)))
34264 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34265 for (nbits = 0; value != 0; value &= value - 1)
34266 nbits++;
34268 else
34269 /* This is arbitrary. */
34270 nbits = 7;
34272 /* Compute costs correctly for widening multiplication. */
34273 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34274 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34275 == GET_MODE_SIZE (mode))
34277 int is_mulwiden = 0;
34278 enum machine_mode inner_mode = GET_MODE (op0);
34280 if (GET_CODE (op0) == GET_CODE (op1))
34281 is_mulwiden = 1, op1 = XEXP (op1, 0);
34282 else if (CONST_INT_P (op1))
34284 if (GET_CODE (op0) == SIGN_EXTEND)
34285 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34286 == INTVAL (op1);
34287 else
34288 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34291 if (is_mulwiden)
34292 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34295 *total = (cost->mult_init[MODE_INDEX (mode)]
34296 + nbits * cost->mult_bit
34297 + rtx_cost (op0, outer_code, opno, speed)
34298 + rtx_cost (op1, outer_code, opno, speed));
34300 return true;
34303 case DIV:
34304 case UDIV:
34305 case MOD:
34306 case UMOD:
34307 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34308 /* ??? SSE cost should be used here. */
34309 *total = cost->fdiv;
34310 else if (X87_FLOAT_MODE_P (mode))
34311 *total = cost->fdiv;
34312 else if (FLOAT_MODE_P (mode))
34313 /* ??? SSE vector cost should be used here. */
34314 *total = cost->fdiv;
34315 else
34316 *total = cost->divide[MODE_INDEX (mode)];
34317 return false;
34319 case PLUS:
34320 if (GET_MODE_CLASS (mode) == MODE_INT
34321 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34323 if (GET_CODE (XEXP (x, 0)) == PLUS
34324 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34325 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34326 && CONSTANT_P (XEXP (x, 1)))
34328 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34329 if (val == 2 || val == 4 || val == 8)
34331 *total = cost->lea;
34332 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34333 outer_code, opno, speed);
34334 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34335 outer_code, opno, speed);
34336 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34337 return true;
34340 else if (GET_CODE (XEXP (x, 0)) == MULT
34341 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34343 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34344 if (val == 2 || val == 4 || val == 8)
34346 *total = cost->lea;
34347 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34348 outer_code, opno, speed);
34349 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34350 return true;
34353 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34355 *total = cost->lea;
34356 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34357 outer_code, opno, speed);
34358 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34359 outer_code, opno, speed);
34360 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34361 return true;
34364 /* FALLTHRU */
34366 case MINUS:
34367 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34369 /* ??? SSE cost should be used here. */
34370 *total = cost->fadd;
34371 return false;
34373 else if (X87_FLOAT_MODE_P (mode))
34375 *total = cost->fadd;
34376 return false;
34378 else if (FLOAT_MODE_P (mode))
34380 /* ??? SSE vector cost should be used here. */
34381 *total = cost->fadd;
34382 return false;
34384 /* FALLTHRU */
34386 case AND:
34387 case IOR:
34388 case XOR:
34389 if (GET_MODE_CLASS (mode) == MODE_INT
34390 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34392 *total = (cost->add * 2
34393 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34394 << (GET_MODE (XEXP (x, 0)) != DImode))
34395 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34396 << (GET_MODE (XEXP (x, 1)) != DImode)));
34397 return true;
34399 /* FALLTHRU */
34401 case NEG:
34402 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34404 /* ??? SSE cost should be used here. */
34405 *total = cost->fchs;
34406 return false;
34408 else if (X87_FLOAT_MODE_P (mode))
34410 *total = cost->fchs;
34411 return false;
34413 else if (FLOAT_MODE_P (mode))
34415 /* ??? SSE vector cost should be used here. */
34416 *total = cost->fchs;
34417 return false;
34419 /* FALLTHRU */
34421 case NOT:
34422 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34424 /* ??? Should be SSE vector operation cost. */
34425 /* At least for published AMD latencies, this really is the same
34426 as the latency for a simple fpu operation like fabs. */
34427 *total = cost->fabs;
34429 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34430 *total = cost->add * 2;
34431 else
34432 *total = cost->add;
34433 return false;
34435 case COMPARE:
34436 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34437 && XEXP (XEXP (x, 0), 1) == const1_rtx
34438 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34439 && XEXP (x, 1) == const0_rtx)
34441 /* This kind of construct is implemented using test[bwl].
34442 Treat it as if we had an AND. */
34443 *total = (cost->add
34444 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34445 + rtx_cost (const1_rtx, outer_code, opno, speed));
34446 return true;
34448 return false;
34450 case FLOAT_EXTEND:
34451 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34452 *total = 0;
34453 return false;
34455 case ABS:
34456 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34457 /* ??? SSE cost should be used here. */
34458 *total = cost->fabs;
34459 else if (X87_FLOAT_MODE_P (mode))
34460 *total = cost->fabs;
34461 else if (FLOAT_MODE_P (mode))
34462 /* ??? SSE vector cost should be used here. */
34463 *total = cost->fabs;
34464 return false;
34466 case SQRT:
34467 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34468 /* ??? SSE cost should be used here. */
34469 *total = cost->fsqrt;
34470 else if (X87_FLOAT_MODE_P (mode))
34471 *total = cost->fsqrt;
34472 else if (FLOAT_MODE_P (mode))
34473 /* ??? SSE vector cost should be used here. */
34474 *total = cost->fsqrt;
34475 return false;
34477 case UNSPEC:
34478 if (XINT (x, 1) == UNSPEC_TP)
34479 *total = 0;
34480 return false;
34482 case VEC_SELECT:
34483 case VEC_CONCAT:
34484 case VEC_MERGE:
34485 case VEC_DUPLICATE:
34486 /* ??? Assume all of these vector manipulation patterns are
34487 recognizable. In which case they all pretty much have the
34488 same cost. */
34489 *total = cost->fabs;
34490 return true;
34492 default:
34493 return false;
34497 #if TARGET_MACHO
34499 static int current_machopic_label_num;
34501 /* Given a symbol name and its associated stub, write out the
34502 definition of the stub. */
34504 void
34505 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34507 unsigned int length;
34508 char *binder_name, *symbol_name, lazy_ptr_name[32];
34509 int label = ++current_machopic_label_num;
34511 /* For 64-bit we shouldn't get here. */
34512 gcc_assert (!TARGET_64BIT);
34514 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34515 symb = targetm.strip_name_encoding (symb);
34517 length = strlen (stub);
34518 binder_name = XALLOCAVEC (char, length + 32);
34519 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34521 length = strlen (symb);
34522 symbol_name = XALLOCAVEC (char, length + 32);
34523 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34525 sprintf (lazy_ptr_name, "L%d$lz", label);
34527 if (MACHOPIC_ATT_STUB)
34528 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34529 else if (MACHOPIC_PURE)
34530 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34531 else
34532 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34534 fprintf (file, "%s:\n", stub);
34535 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34537 if (MACHOPIC_ATT_STUB)
34539 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34541 else if (MACHOPIC_PURE)
34543 /* PIC stub. */
34544 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34545 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34546 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34547 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34548 label, lazy_ptr_name, label);
34549 fprintf (file, "\tjmp\t*%%ecx\n");
34551 else
34552 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34554 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34555 it needs no stub-binding-helper. */
34556 if (MACHOPIC_ATT_STUB)
34557 return;
34559 fprintf (file, "%s:\n", binder_name);
34561 if (MACHOPIC_PURE)
34563 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34564 fprintf (file, "\tpushl\t%%ecx\n");
34566 else
34567 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34569 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34571 /* N.B. Keep the correspondence of these
34572 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34573 old-pic/new-pic/non-pic stubs; altering this will break
34574 compatibility with existing dylibs. */
34575 if (MACHOPIC_PURE)
34577 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34578 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34580 else
34581 /* 16-byte -mdynamic-no-pic stub. */
34582 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34584 fprintf (file, "%s:\n", lazy_ptr_name);
34585 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34586 fprintf (file, ASM_LONG "%s\n", binder_name);
34588 #endif /* TARGET_MACHO */
34590 /* Order the registers for register allocator. */
34592 void
34593 x86_order_regs_for_local_alloc (void)
34595 int pos = 0;
34596 int i;
34598 /* First allocate the local general purpose registers. */
34599 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34600 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34601 reg_alloc_order [pos++] = i;
34603 /* Global general purpose registers. */
34604 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34605 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34606 reg_alloc_order [pos++] = i;
34608 /* x87 registers come first in case we are doing FP math
34609 using them. */
34610 if (!TARGET_SSE_MATH)
34611 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34612 reg_alloc_order [pos++] = i;
34614 /* SSE registers. */
34615 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34616 reg_alloc_order [pos++] = i;
34617 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34618 reg_alloc_order [pos++] = i;
34620 /* x87 registers. */
34621 if (TARGET_SSE_MATH)
34622 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34623 reg_alloc_order [pos++] = i;
34625 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34626 reg_alloc_order [pos++] = i;
 34628 /* Initialize the rest of the array, as we do not allocate some
 34629 registers at all. */
34630 while (pos < FIRST_PSEUDO_REGISTER)
34631 reg_alloc_order [pos++] = 0;
 34634 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
 34635 in struct attribute_spec.handler. */
34636 static tree
34637 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34638 tree args,
34639 int flags ATTRIBUTE_UNUSED,
34640 bool *no_add_attrs)
34642 if (TREE_CODE (*node) != FUNCTION_TYPE
34643 && TREE_CODE (*node) != METHOD_TYPE
34644 && TREE_CODE (*node) != FIELD_DECL
34645 && TREE_CODE (*node) != TYPE_DECL)
34647 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34648 name);
34649 *no_add_attrs = true;
34650 return NULL_TREE;
34652 if (TARGET_64BIT)
34654 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34655 name);
34656 *no_add_attrs = true;
34657 return NULL_TREE;
34659 if (is_attribute_p ("callee_pop_aggregate_return", name))
34661 tree cst;
34663 cst = TREE_VALUE (args);
34664 if (TREE_CODE (cst) != INTEGER_CST)
34666 warning (OPT_Wattributes,
34667 "%qE attribute requires an integer constant argument",
34668 name);
34669 *no_add_attrs = true;
34671 else if (compare_tree_int (cst, 0) != 0
34672 && compare_tree_int (cst, 1) != 0)
34674 warning (OPT_Wattributes,
34675 "argument to %qE attribute is neither zero, nor one",
34676 name);
34677 *no_add_attrs = true;
34680 return NULL_TREE;
34683 return NULL_TREE;
 34686 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
 34687 struct attribute_spec.handler. */
34688 static tree
34689 ix86_handle_abi_attribute (tree *node, tree name,
34690 tree args ATTRIBUTE_UNUSED,
34691 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34693 if (TREE_CODE (*node) != FUNCTION_TYPE
34694 && TREE_CODE (*node) != METHOD_TYPE
34695 && TREE_CODE (*node) != FIELD_DECL
34696 && TREE_CODE (*node) != TYPE_DECL)
34698 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34699 name);
34700 *no_add_attrs = true;
34701 return NULL_TREE;
34704 /* Can combine regparm with all attributes but fastcall. */
34705 if (is_attribute_p ("ms_abi", name))
34707 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34709 error ("ms_abi and sysv_abi attributes are not compatible");
34712 return NULL_TREE;
34714 else if (is_attribute_p ("sysv_abi", name))
34716 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34718 error ("ms_abi and sysv_abi attributes are not compatible");
34721 return NULL_TREE;
34724 return NULL_TREE;
34727 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34728 struct attribute_spec.handler. */
34729 static tree
34730 ix86_handle_struct_attribute (tree *node, tree name,
34731 tree args ATTRIBUTE_UNUSED,
34732 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34734 tree *type = NULL;
34735 if (DECL_P (*node))
34737 if (TREE_CODE (*node) == TYPE_DECL)
34738 type = &TREE_TYPE (*node);
34740 else
34741 type = node;
34743 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34745 warning (OPT_Wattributes, "%qE attribute ignored",
34746 name);
34747 *no_add_attrs = true;
34750 else if ((is_attribute_p ("ms_struct", name)
34751 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34752 || ((is_attribute_p ("gcc_struct", name)
34753 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34755 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34756 name);
34757 *no_add_attrs = true;
34760 return NULL_TREE;
34763 static tree
34764 ix86_handle_fndecl_attribute (tree *node, tree name,
34765 tree args ATTRIBUTE_UNUSED,
34766 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34768 if (TREE_CODE (*node) != FUNCTION_DECL)
34770 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34771 name);
34772 *no_add_attrs = true;
34774 return NULL_TREE;
34777 static bool
34778 ix86_ms_bitfield_layout_p (const_tree record_type)
34780 return ((TARGET_MS_BITFIELD_LAYOUT
34781 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34782 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34785 /* Returns an expression indicating where the this parameter is
34786 located on entry to the FUNCTION. */
34788 static rtx
34789 x86_this_parameter (tree function)
34791 tree type = TREE_TYPE (function);
34792 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34793 int nregs;
34795 if (TARGET_64BIT)
34797 const int *parm_regs;
34799 if (ix86_function_type_abi (type) == MS_ABI)
34800 parm_regs = x86_64_ms_abi_int_parameter_registers;
34801 else
34802 parm_regs = x86_64_int_parameter_registers;
34803 return gen_rtx_REG (Pmode, parm_regs[aggr]);
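  /* For a 32-bit target, `this' is passed in a register only if the
     function uses register argument passing (regparm, fastcall or
     thiscall) and is not a stdarg function; the aggregate-return cases
     below may still push it onto the stack.  */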
34806 nregs = ix86_function_regparm (type, function);
34808 if (nregs > 0 && !stdarg_p (type))
34810 int regno;
34811 unsigned int ccvt = ix86_get_callcvt (type);
34813 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34814 regno = aggr ? DX_REG : CX_REG;
34815 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34817 regno = CX_REG;
34818 if (aggr)
34819 return gen_rtx_MEM (SImode,
34820 plus_constant (Pmode, stack_pointer_rtx, 4));
34822 else
34824 regno = AX_REG;
34825 if (aggr)
34827 regno = DX_REG;
34828 if (nregs == 1)
34829 return gen_rtx_MEM (SImode,
34830 plus_constant (Pmode,
34831 stack_pointer_rtx, 4));
34834 return gen_rtx_REG (SImode, regno);
34837 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34838 aggr ? 8 : 4));
34841 /* Determine whether x86_output_mi_thunk can succeed. */
34843 static bool
34844 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34845 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34846 HOST_WIDE_INT vcall_offset, const_tree function)
34848 /* 64-bit can handle anything. */
34849 if (TARGET_64BIT)
34850 return true;
34852 /* For 32-bit, everything's fine if we have one free register. */
34853 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34854 return true;
34856 /* Need a free register for vcall_offset. */
34857 if (vcall_offset)
34858 return false;
34860 /* Need a free register for GOT references. */
34861 if (flag_pic && !targetm.binds_local_p (function))
34862 return false;
34864 /* Otherwise ok. */
34865 return true;
34868 /* Output the assembler code for a thunk function. THUNK_DECL is the
34869 declaration for the thunk function itself, FUNCTION is the decl for
34870 the target function. DELTA is an immediate constant offset to be
34871 added to THIS. If VCALL_OFFSET is nonzero, the word at
34872 *(*this + vcall_offset) should be added to THIS. */
34874 static void
34875 x86_output_mi_thunk (FILE *file,
34876 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34877 HOST_WIDE_INT vcall_offset, tree function)
34879 rtx this_param = x86_this_parameter (function);
34880 rtx this_reg, tmp, fnaddr;
34881 unsigned int tmp_regno;
34883 if (TARGET_64BIT)
34884 tmp_regno = R10_REG;
34885 else
34887 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34888 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34889 tmp_regno = AX_REG;
34890 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34891 tmp_regno = DX_REG;
34892 else
34893 tmp_regno = CX_REG;
34896 emit_note (NOTE_INSN_PROLOGUE_END);
34898 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34899 pull it in now and let DELTA benefit. */
34900 if (REG_P (this_param))
34901 this_reg = this_param;
34902 else if (vcall_offset)
34904 /* Put the this parameter into %eax. */
34905 this_reg = gen_rtx_REG (Pmode, AX_REG);
34906 emit_move_insn (this_reg, this_param);
34908 else
34909 this_reg = NULL_RTX;
34911 /* Adjust the this parameter by a fixed constant. */
34912 if (delta)
34914 rtx delta_rtx = GEN_INT (delta);
34915 rtx delta_dst = this_reg ? this_reg : this_param;
34917 if (TARGET_64BIT)
34919 if (!x86_64_general_operand (delta_rtx, Pmode))
34921 tmp = gen_rtx_REG (Pmode, tmp_regno);
34922 emit_move_insn (tmp, delta_rtx);
34923 delta_rtx = tmp;
34927 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34930 /* Adjust the this parameter by a value stored in the vtable. */
34931 if (vcall_offset)
34933 rtx vcall_addr, vcall_mem, this_mem;
34935 tmp = gen_rtx_REG (Pmode, tmp_regno);
34937 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
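      /* If Pmode is wider than ptr_mode (as with -mx32, where pointers are
	 SImode but Pmode is DImode), the pointer loaded below must be
	 zero-extended to Pmode.  */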
34938 if (Pmode != ptr_mode)
34939 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34940 emit_move_insn (tmp, this_mem);
34942 /* Adjust the this parameter. */
34943 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34944 if (TARGET_64BIT
34945 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34947 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34948 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34949 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34952 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34953 if (Pmode != ptr_mode)
34954 emit_insn (gen_addsi_1_zext (this_reg,
34955 gen_rtx_REG (ptr_mode,
34956 REGNO (this_reg)),
34957 vcall_mem));
34958 else
34959 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34962 /* If necessary, drop THIS back to its stack slot. */
34963 if (this_reg && this_reg != this_param)
34964 emit_move_insn (this_param, this_reg);
34966 fnaddr = XEXP (DECL_RTL (function), 0);
34967 if (TARGET_64BIT)
34969 if (!flag_pic || targetm.binds_local_p (function)
34970 || cfun->machine->call_abi == MS_ABI)
34972 else
34974 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34975 tmp = gen_rtx_CONST (Pmode, tmp);
34976 fnaddr = gen_rtx_MEM (Pmode, tmp);
34979 else
34981 if (!flag_pic || targetm.binds_local_p (function))
34983 #if TARGET_MACHO
34984 else if (TARGET_MACHO)
34986 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34987 fnaddr = XEXP (fnaddr, 0);
34989 #endif /* TARGET_MACHO */
34990 else
34992 tmp = gen_rtx_REG (Pmode, CX_REG);
34993 output_set_got (tmp, NULL_RTX);
34995 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
34996 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
34997 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35001 /* Our sibling call patterns do not allow memories, because we have no
35002 predicate that can distinguish between frame and non-frame memory.
35003 For our purposes here, we can get away with (ab)using a jump pattern,
35004 because we're going to do no optimization. */
35005 if (MEM_P (fnaddr))
35006 emit_jump_insn (gen_indirect_jump (fnaddr));
35007 else
35009 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35010 fnaddr = legitimize_pic_address (fnaddr,
35011 gen_rtx_REG (Pmode, tmp_regno));
35013 if (!sibcall_insn_operand (fnaddr, word_mode))
35015 tmp = gen_rtx_REG (word_mode, tmp_regno);
35016 if (GET_MODE (fnaddr) != word_mode)
35017 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35018 emit_move_insn (tmp, fnaddr);
35019 fnaddr = tmp;
35022 tmp = gen_rtx_MEM (QImode, fnaddr);
35023 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35024 tmp = emit_call_insn (tmp);
35025 SIBLING_CALL_P (tmp) = 1;
35027 emit_barrier ();
35029 /* Emit just enough of rest_of_compilation to get the insns emitted.
35030 Note that use_thunk calls assemble_start_function et al. */
35031 tmp = get_insns ();
35032 shorten_branches (tmp);
35033 final_start_function (tmp, file, 1);
35034 final (tmp, file, 1);
35035 final_end_function ();
35038 static void
35039 x86_file_start (void)
35041 default_file_start ();
35042 #if TARGET_MACHO
35043 darwin_file_start ();
35044 #endif
35045 if (X86_FILE_START_VERSION_DIRECTIVE)
35046 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35047 if (X86_FILE_START_FLTUSED)
35048 fputs ("\t.global\t__fltused\n", asm_out_file);
35049 if (ix86_asm_dialect == ASM_INTEL)
35050 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
 35053 int
 35054 x86_field_alignment (tree field, int computed)
35056 enum machine_mode mode;
35057 tree type = TREE_TYPE (field);
35059 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35060 return computed;
35061 mode = TYPE_MODE (strip_array_types (type));
35062 if (mode == DFmode || mode == DCmode
35063 || GET_MODE_CLASS (mode) == MODE_INT
35064 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35065 return MIN (32, computed);
35066 return computed;
35069 /* Output assembler code to FILE to increment profiler label # LABELNO
35070 for profiling a function entry. */
35071 void
35072 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35074 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35075 : MCOUNT_NAME);
35077 if (TARGET_64BIT)
35079 #ifndef NO_PROFILE_COUNTERS
35080 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35081 #endif
35083 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35084 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35085 else
35086 fprintf (file, "\tcall\t%s\n", mcount_name);
35088 else if (flag_pic)
35090 #ifndef NO_PROFILE_COUNTERS
35091 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35092 LPREFIX, labelno);
35093 #endif
35094 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35096 else
35098 #ifndef NO_PROFILE_COUNTERS
35099 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35100 LPREFIX, labelno);
35101 #endif
35102 fprintf (file, "\tcall\t%s\n", mcount_name);
35106 /* We don't have exact information about the insn sizes, but we may assume
 35107 quite safely that we are informed about all 1-byte insns and memory
35108 address sizes. This is enough to eliminate unnecessary padding in
35109 99% of cases. */
35111 static int
35112 min_insn_size (rtx insn)
35114 int l = 0, len;
35116 if (!INSN_P (insn) || !active_insn_p (insn))
35117 return 0;
 35119 /* Discard alignments we've emitted and jump table data. */
35120 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35121 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35122 return 0;
35123 if (JUMP_TABLE_DATA_P (insn))
35124 return 0;
 35126 /* Important case - calls are always 5 bytes.
 35127 It is common to have many calls in a row. */
35128 if (CALL_P (insn)
35129 && symbolic_reference_mentioned_p (PATTERN (insn))
35130 && !SIBLING_CALL_P (insn))
35131 return 5;
35132 len = get_attr_length (insn);
35133 if (len <= 1)
35134 return 1;
35136 /* For normal instructions we rely on get_attr_length being exact,
35137 with a few exceptions. */
35138 if (!JUMP_P (insn))
35140 enum attr_type type = get_attr_type (insn);
35142 switch (type)
35144 case TYPE_MULTI:
35145 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35146 || asm_noperands (PATTERN (insn)) >= 0)
35147 return 0;
35148 break;
35149 case TYPE_OTHER:
35150 case TYPE_FCMP:
35151 break;
35152 default:
35153 /* Otherwise trust get_attr_length. */
35154 return len;
35157 l = get_attr_length_address (insn);
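  /* Addresses that mention a symbol need at least a 4-byte displacement.  */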
35158 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35159 l = 4;
35161 if (l)
35162 return 1+l;
35163 else
35164 return 2;
35167 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35169 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
35170 window. */
35172 static void
35173 ix86_avoid_jump_mispredicts (void)
35175 rtx insn, start = get_insns ();
35176 int nbytes = 0, njumps = 0;
35177 int isjump = 0;
 35179 /* Look for all minimal intervals of instructions containing 4 jumps.
 35180 The intervals are bounded by START and INSN. NBYTES is the total
 35181 size of the instructions in the interval, including INSN but not
 35182 including START. When NBYTES is smaller than 16 bytes, it is possible
 35183 that the ends of START and INSN land in the same 16-byte window.
 35185 The smallest offset at which INSN can start is the case where START
 35186 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
 35187 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN). */
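  /* In effect: whenever a fourth jump or call would otherwise fall into the
     same 16-byte window as three earlier ones, a pad insn is emitted before
     it so that it starts in the next window.  */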
35189 for (insn = start; insn; insn = NEXT_INSN (insn))
35191 int min_size;
35193 if (LABEL_P (insn))
35195 int align = label_to_alignment (insn);
35196 int max_skip = label_to_max_skip (insn);
35198 if (max_skip > 15)
35199 max_skip = 15;
35200 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35201 already in the current 16 byte page, because otherwise
35202 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35203 bytes to reach 16 byte boundary. */
35204 if (align <= 0
35205 || (align <= 3 && max_skip != (1 << align) - 1))
35206 max_skip = 0;
35207 if (dump_file)
35208 fprintf (dump_file, "Label %i with max_skip %i\n",
35209 INSN_UID (insn), max_skip);
35210 if (max_skip)
35212 while (nbytes + max_skip >= 16)
35214 start = NEXT_INSN (start);
35215 if ((JUMP_P (start)
35216 && GET_CODE (PATTERN (start)) != ADDR_VEC
35217 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35218 || CALL_P (start))
35219 njumps--, isjump = 1;
35220 else
35221 isjump = 0;
35222 nbytes -= min_insn_size (start);
35225 continue;
35228 min_size = min_insn_size (insn);
35229 nbytes += min_size;
35230 if (dump_file)
35231 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35232 INSN_UID (insn), min_size);
35233 if ((JUMP_P (insn)
35234 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35235 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35236 || CALL_P (insn))
35237 njumps++;
35238 else
35239 continue;
35241 while (njumps > 3)
35243 start = NEXT_INSN (start);
35244 if ((JUMP_P (start)
35245 && GET_CODE (PATTERN (start)) != ADDR_VEC
35246 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35247 || CALL_P (start))
35248 njumps--, isjump = 1;
35249 else
35250 isjump = 0;
35251 nbytes -= min_insn_size (start);
35253 gcc_assert (njumps >= 0);
35254 if (dump_file)
35255 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35256 INSN_UID (start), INSN_UID (insn), nbytes);
35258 if (njumps == 3 && isjump && nbytes < 16)
35260 int padsize = 15 - nbytes + min_insn_size (insn);
35262 if (dump_file)
35263 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35264 INSN_UID (insn), padsize);
35265 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35269 #endif
 35271 /* AMD Athlon works faster
 35272 when RET is not the destination of a conditional jump or directly preceded
 35273 by another jump instruction. We avoid the penalty by inserting a NOP just
 35274 before the RET instruction in such cases. */
35275 static void
35276 ix86_pad_returns (void)
35278 edge e;
35279 edge_iterator ei;
35281 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35283 basic_block bb = e->src;
35284 rtx ret = BB_END (bb);
35285 rtx prev;
35286 bool replace = false;
35288 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35289 || optimize_bb_for_size_p (bb))
35290 continue;
35291 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35292 if (active_insn_p (prev) || LABEL_P (prev))
35293 break;
35294 if (prev && LABEL_P (prev))
35296 edge e;
35297 edge_iterator ei;
35299 FOR_EACH_EDGE (e, ei, bb->preds)
35300 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35301 && !(e->flags & EDGE_FALLTHRU))
35302 replace = true;
35304 if (!replace)
35306 prev = prev_active_insn (ret);
35307 if (prev
35308 && ((JUMP_P (prev) && any_condjump_p (prev))
35309 || CALL_P (prev)))
35310 replace = true;
 35311 /* Empty functions get a branch mispredict even when
 35312 the jump destination is not visible to us. */
35313 if (!prev && !optimize_function_for_size_p (cfun))
35314 replace = true;
35316 if (replace)
35318 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35319 delete_insn (ret);
35324 /* Count the minimum number of instructions in BB. Return 4 if the
35325 number of instructions >= 4. */
35327 static int
35328 ix86_count_insn_bb (basic_block bb)
35330 rtx insn;
35331 int insn_count = 0;
35333 /* Count number of instructions in this block. Return 4 if the number
35334 of instructions >= 4. */
35335 FOR_BB_INSNS (bb, insn)
35337 /* Only happen in exit blocks. */
35338 if (JUMP_P (insn)
35339 && ANY_RETURN_P (PATTERN (insn)))
35340 break;
35342 if (NONDEBUG_INSN_P (insn)
35343 && GET_CODE (PATTERN (insn)) != USE
35344 && GET_CODE (PATTERN (insn)) != CLOBBER)
35346 insn_count++;
35347 if (insn_count >= 4)
35348 return insn_count;
35352 return insn_count;
35356 /* Count the minimum number of instructions in code path in BB.
35357 Return 4 if the number of instructions >= 4. */
35359 static int
35360 ix86_count_insn (basic_block bb)
35362 edge e;
35363 edge_iterator ei;
35364 int min_prev_count;
35366 /* Only bother counting instructions along paths with no
35367 more than 2 basic blocks between entry and exit. Given
35368 that BB has an edge to exit, determine if a predecessor
35369 of BB has an edge from entry. If so, compute the number
35370 of instructions in the predecessor block. If there
35371 happen to be multiple such blocks, compute the minimum. */
35372 min_prev_count = 4;
35373 FOR_EACH_EDGE (e, ei, bb->preds)
35375 edge prev_e;
35376 edge_iterator prev_ei;
35378 if (e->src == ENTRY_BLOCK_PTR)
35380 min_prev_count = 0;
35381 break;
35383 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35385 if (prev_e->src == ENTRY_BLOCK_PTR)
35387 int count = ix86_count_insn_bb (e->src);
35388 if (count < min_prev_count)
35389 min_prev_count = count;
35390 break;
35395 if (min_prev_count < 4)
35396 min_prev_count += ix86_count_insn_bb (bb);
35398 return min_prev_count;
 35401 /* Pad short functions to 4 instructions. */
35403 static void
35404 ix86_pad_short_function (void)
35406 edge e;
35407 edge_iterator ei;
35409 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35411 rtx ret = BB_END (e->src);
35412 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35414 int insn_count = ix86_count_insn (e->src);
35416 /* Pad short function. */
35417 if (insn_count < 4)
35419 rtx insn = ret;
35421 /* Find epilogue. */
35422 while (insn
35423 && (!NOTE_P (insn)
35424 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35425 insn = PREV_INSN (insn);
35427 if (!insn)
35428 insn = ret;
35430 /* Two NOPs count as one instruction. */
35431 insn_count = 2 * (4 - insn_count);
35432 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
 35438 /* Implement machine specific optimizations. We implement padding of returns
 35439 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
35440 static void
35441 ix86_reorg (void)
35443 /* We are freeing block_for_insn in the toplev to keep compatibility
35444 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35445 compute_bb_for_insn ();
35447 if (optimize && optimize_function_for_speed_p (cfun))
35449 if (TARGET_PAD_SHORT_FUNCTION)
35450 ix86_pad_short_function ();
35451 else if (TARGET_PAD_RETURNS)
35452 ix86_pad_returns ();
35453 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35454 if (TARGET_FOUR_JUMP_LIMIT)
35455 ix86_avoid_jump_mispredicts ();
35456 #endif
 35460 /* Return true when a QImode register that must be represented via a REX
 35461 prefix is used. */
35462 bool
35463 x86_extended_QIreg_mentioned_p (rtx insn)
35465 int i;
35466 extract_insn_cached (insn);
35467 for (i = 0; i < recog_data.n_operands; i++)
35468 if (GENERAL_REG_P (recog_data.operand[i])
35469 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35470 return true;
35471 return false;
 35474 /* Return nonzero when P points to a register encoded via a REX prefix.
 35475 Called via for_each_rtx. */
35476 static int
35477 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35479 unsigned int regno;
35480 if (!REG_P (*p))
35481 return 0;
35482 regno = REGNO (*p);
35483 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
 35486 /* Return true when INSN mentions a register that must be encoded using a
 35487 REX prefix. */
35488 bool
35489 x86_extended_reg_mentioned_p (rtx insn)
35491 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35492 extended_reg_mentioned_1, NULL);
35495 /* If profitable, negate (without causing overflow) integer constant
35496 of mode MODE at location LOC. Return true in this case. */
35497 bool
35498 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35500 HOST_WIDE_INT val;
35502 if (!CONST_INT_P (*loc))
35503 return false;
35505 switch (mode)
35507 case DImode:
35508 /* DImode x86_64 constants must fit in 32 bits. */
35509 gcc_assert (x86_64_immediate_operand (*loc, mode));
35511 mode = SImode;
35512 break;
35514 case SImode:
35515 case HImode:
35516 case QImode:
35517 break;
35519 default:
35520 gcc_unreachable ();
35523 /* Avoid overflows. */
35524 if (mode_signbit_p (mode, *loc))
35525 return false;
35527 val = INTVAL (*loc);
35529 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35530 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
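  /* A sign-extended 8-bit immediate covers -128..127, so -128 already has
     the short encoding while +128 does not.  */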
35531 if ((val < 0 && val != -128)
35532 || val == 128)
35534 *loc = GEN_INT (-val);
35535 return true;
35538 return false;
35541 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35542 optabs would emit if we didn't have TFmode patterns. */
35544 void
35545 x86_emit_floatuns (rtx operands[2])
35547 rtx neglab, donelab, i0, i1, f0, in, out;
35548 enum machine_mode mode, inmode;
35550 inmode = GET_MODE (operands[1]);
35551 gcc_assert (inmode == SImode || inmode == DImode);
35553 out = operands[0];
35554 in = force_reg (inmode, operands[1]);
35555 mode = GET_MODE (out);
35556 neglab = gen_label_rtx ();
35557 donelab = gen_label_rtx ();
35558 f0 = gen_reg_rtx (mode);
35560 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35562 expand_float (out, in, 0);
35564 emit_jump_insn (gen_jump (donelab));
35565 emit_barrier ();
35567 emit_label (neglab);
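  /* The input has its most significant bit set, so it does not fit in the
     signed range.  Compute (in >> 1) | (in & 1), convert that to FP, and
     double the result; OR-ing in the low bit preserves correct rounding.  */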
35569 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35570 1, OPTAB_DIRECT);
35571 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35572 1, OPTAB_DIRECT);
35573 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35575 expand_float (f0, i0, 0);
35577 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35579 emit_label (donelab);
 35582 /* AVX2 supports 32-byte integer vector operations,
 35583 so the longest vector we have to deal with is V32QImode. */
35584 #define MAX_VECT_LEN 32
35586 struct expand_vec_perm_d
35588 rtx target, op0, op1;
35589 unsigned char perm[MAX_VECT_LEN];
35590 enum machine_mode vmode;
35591 unsigned char nelt;
35592 bool one_operand_p;
35593 bool testing_p;
35596 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35597 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35598 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35600 /* Get a vector mode of the same size as the original but with elements
35601 twice as wide. This is only guaranteed to apply to integral vectors. */
35603 static inline enum machine_mode
35604 get_mode_wider_vector (enum machine_mode o)
35606 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35607 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35608 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35609 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35610 return n;
35613 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35614 with all elements equal to VAR. Return true if successful. */
35616 static bool
35617 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35618 rtx target, rtx val)
35620 bool ok;
35622 switch (mode)
35624 case V2SImode:
35625 case V2SFmode:
35626 if (!mmx_ok)
35627 return false;
35628 /* FALLTHRU */
35630 case V4DFmode:
35631 case V4DImode:
35632 case V8SFmode:
35633 case V8SImode:
35634 case V2DFmode:
35635 case V2DImode:
35636 case V4SFmode:
35637 case V4SImode:
35639 rtx insn, dup;
35641 /* First attempt to recognize VAL as-is. */
35642 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35643 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35644 if (recog_memoized (insn) < 0)
35646 rtx seq;
35647 /* If that fails, force VAL into a register. */
35649 start_sequence ();
35650 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35651 seq = get_insns ();
35652 end_sequence ();
35653 if (seq)
35654 emit_insn_before (seq, insn);
35656 ok = recog_memoized (insn) >= 0;
35657 gcc_assert (ok);
35660 return true;
35662 case V4HImode:
35663 if (!mmx_ok)
35664 return false;
35665 if (TARGET_SSE || TARGET_3DNOW_A)
35667 rtx x;
35669 val = gen_lowpart (SImode, val);
35670 x = gen_rtx_TRUNCATE (HImode, val);
35671 x = gen_rtx_VEC_DUPLICATE (mode, x);
35672 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35673 return true;
35675 goto widen;
35677 case V8QImode:
35678 if (!mmx_ok)
35679 return false;
35680 goto widen;
35682 case V8HImode:
35683 if (TARGET_SSE2)
35685 struct expand_vec_perm_d dperm;
35686 rtx tmp1, tmp2;
35688 permute:
35689 memset (&dperm, 0, sizeof (dperm));
35690 dperm.target = target;
35691 dperm.vmode = mode;
35692 dperm.nelt = GET_MODE_NUNITS (mode);
35693 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35694 dperm.one_operand_p = true;
35696 /* Extend to SImode using a paradoxical SUBREG. */
35697 tmp1 = gen_reg_rtx (SImode);
35698 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35700 /* Insert the SImode value as low element of a V4SImode vector. */
35701 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35702 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35704 ok = (expand_vec_perm_1 (&dperm)
35705 || expand_vec_perm_broadcast_1 (&dperm));
35706 gcc_assert (ok);
35707 return ok;
35709 goto widen;
35711 case V16QImode:
35712 if (TARGET_SSE2)
35713 goto permute;
35714 goto widen;
35716 widen:
35717 /* Replicate the value once into the next wider mode and recurse. */
35719 enum machine_mode smode, wsmode, wvmode;
35720 rtx x;
35722 smode = GET_MODE_INNER (mode);
35723 wvmode = get_mode_wider_vector (mode);
35724 wsmode = GET_MODE_INNER (wvmode);
35726 val = convert_modes (wsmode, smode, val, true);
35727 x = expand_simple_binop (wsmode, ASHIFT, val,
35728 GEN_INT (GET_MODE_BITSIZE (smode)),
35729 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35730 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
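	/* VAL now holds two adjacent copies of the original element
	   (e.g. the byte 0xAB becomes the half-word 0xABAB), so duplicating
	   it in the wider vector mode replicates the original value in
	   every narrow element.  */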
35732 x = gen_lowpart (wvmode, target);
35733 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35734 gcc_assert (ok);
35735 return ok;
35738 case V16HImode:
35739 case V32QImode:
35741 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35742 rtx x = gen_reg_rtx (hvmode);
35744 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35745 gcc_assert (ok);
35747 x = gen_rtx_VEC_CONCAT (mode, x, x);
35748 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35750 return true;
35752 default:
35753 return false;
35757 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35758 whose ONE_VAR element is VAR, and other elements are zero. Return true
35759 if successful. */
35761 static bool
35762 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35763 rtx target, rtx var, int one_var)
35765 enum machine_mode vsimode;
35766 rtx new_target;
35767 rtx x, tmp;
35768 bool use_vector_set = false;
35770 switch (mode)
35772 case V2DImode:
35773 /* For SSE4.1, we normally use vector set. But if the second
35774 element is zero and inter-unit moves are OK, we use movq
35775 instead. */
35776 use_vector_set = (TARGET_64BIT
35777 && TARGET_SSE4_1
35778 && !(TARGET_INTER_UNIT_MOVES
35779 && one_var == 0));
35780 break;
35781 case V16QImode:
35782 case V4SImode:
35783 case V4SFmode:
35784 use_vector_set = TARGET_SSE4_1;
35785 break;
35786 case V8HImode:
35787 use_vector_set = TARGET_SSE2;
35788 break;
35789 case V4HImode:
35790 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35791 break;
35792 case V32QImode:
35793 case V16HImode:
35794 case V8SImode:
35795 case V8SFmode:
35796 case V4DFmode:
35797 use_vector_set = TARGET_AVX;
35798 break;
35799 case V4DImode:
35800 /* Use ix86_expand_vector_set in 64bit mode only. */
35801 use_vector_set = TARGET_AVX && TARGET_64BIT;
35802 break;
35803 default:
35804 break;
35807 if (use_vector_set)
35809 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35810 var = force_reg (GET_MODE_INNER (mode), var);
35811 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35812 return true;
35815 switch (mode)
35817 case V2SFmode:
35818 case V2SImode:
35819 if (!mmx_ok)
35820 return false;
35821 /* FALLTHRU */
35823 case V2DFmode:
35824 case V2DImode:
35825 if (one_var != 0)
35826 return false;
35827 var = force_reg (GET_MODE_INNER (mode), var);
35828 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35829 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35830 return true;
35832 case V4SFmode:
35833 case V4SImode:
35834 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35835 new_target = gen_reg_rtx (mode);
35836 else
35837 new_target = target;
35838 var = force_reg (GET_MODE_INNER (mode), var);
35839 x = gen_rtx_VEC_DUPLICATE (mode, var);
35840 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35841 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35842 if (one_var != 0)
35844 /* We need to shuffle the value to the correct position, so
35845 create a new pseudo to store the intermediate result. */
35847 /* With SSE2, we can use the integer shuffle insns. */
35848 if (mode != V4SFmode && TARGET_SSE2)
35850 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35851 const1_rtx,
35852 GEN_INT (one_var == 1 ? 0 : 1),
35853 GEN_INT (one_var == 2 ? 0 : 1),
35854 GEN_INT (one_var == 3 ? 0 : 1)));
35855 if (target != new_target)
35856 emit_move_insn (target, new_target);
35857 return true;
35860 /* Otherwise convert the intermediate result to V4SFmode and
35861 use the SSE1 shuffle instructions. */
35862 if (mode != V4SFmode)
35864 tmp = gen_reg_rtx (V4SFmode);
35865 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35867 else
35868 tmp = new_target;
35870 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35871 const1_rtx,
35872 GEN_INT (one_var == 1 ? 0 : 1),
35873 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35874 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35876 if (mode != V4SFmode)
35877 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35878 else if (tmp != target)
35879 emit_move_insn (target, tmp);
35881 else if (target != new_target)
35882 emit_move_insn (target, new_target);
35883 return true;
35885 case V8HImode:
35886 case V16QImode:
35887 vsimode = V4SImode;
35888 goto widen;
35889 case V4HImode:
35890 case V8QImode:
35891 if (!mmx_ok)
35892 return false;
35893 vsimode = V2SImode;
35894 goto widen;
35895 widen:
35896 if (one_var != 0)
35897 return false;
35899 /* Zero extend the variable element to SImode and recurse. */
35900 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35902 x = gen_reg_rtx (vsimode);
35903 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35904 var, one_var))
35905 gcc_unreachable ();
35907 emit_move_insn (target, gen_lowpart (mode, x));
35908 return true;
35910 default:
35911 return false;
35915 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35916 consisting of the values in VALS. It is known that all elements
35917 except ONE_VAR are constants. Return true if successful. */
35919 static bool
35920 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35921 rtx target, rtx vals, int one_var)
35923 rtx var = XVECEXP (vals, 0, one_var);
35924 enum machine_mode wmode;
35925 rtx const_vec, x;
35927 const_vec = copy_rtx (vals);
35928 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35929 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35931 switch (mode)
35933 case V2DFmode:
35934 case V2DImode:
35935 case V2SFmode:
35936 case V2SImode:
35937 /* For the two element vectors, it's just as easy to use
35938 the general case. */
35939 return false;
35941 case V4DImode:
35942 /* Use ix86_expand_vector_set in 64bit mode only. */
35943 if (!TARGET_64BIT)
35944 return false;
35945 case V4DFmode:
35946 case V8SFmode:
35947 case V8SImode:
35948 case V16HImode:
35949 case V32QImode:
35950 case V4SFmode:
35951 case V4SImode:
35952 case V8HImode:
35953 case V4HImode:
35954 break;
35956 case V16QImode:
35957 if (TARGET_SSE4_1)
35958 break;
35959 wmode = V8HImode;
35960 goto widen;
35961 case V8QImode:
35962 wmode = V4HImode;
35963 goto widen;
35964 widen:
35965 /* There's no way to set one QImode entry easily. Combine
35966 the variable value with its adjacent constant value, and
35967 promote to an HImode set. */
35968 x = XVECEXP (vals, 0, one_var ^ 1);
35969 if (one_var & 1)
35971 var = convert_modes (HImode, QImode, var, true);
35972 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35973 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35974 x = GEN_INT (INTVAL (x) & 0xff);
35976 else
35978 var = convert_modes (HImode, QImode, var, true);
35979 x = gen_int_mode (INTVAL (x) << 8, HImode);
35981 if (x != const0_rtx)
35982 var = expand_simple_binop (HImode, IOR, var, x, var,
35983 1, OPTAB_LIB_WIDEN);
35985 x = gen_reg_rtx (wmode);
35986 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35987 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35989 emit_move_insn (target, gen_lowpart (mode, x));
35990 return true;
35992 default:
35993 return false;
35996 emit_move_insn (target, const_vec);
35997 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35998 return true;
36001 /* A subroutine of ix86_expand_vector_init_general. Use vector
36002 concatenate to handle the most general case: all values variable,
36003 and none identical. */
36005 static void
36006 ix86_expand_vector_init_concat (enum machine_mode mode,
36007 rtx target, rtx *ops, int n)
36009 enum machine_mode cmode, hmode = VOIDmode;
36010 rtx first[8], second[4];
36011 rtvec v;
36012 int i, j;
36014 switch (n)
36016 case 2:
36017 switch (mode)
36019 case V8SImode:
36020 cmode = V4SImode;
36021 break;
36022 case V8SFmode:
36023 cmode = V4SFmode;
36024 break;
36025 case V4DImode:
36026 cmode = V2DImode;
36027 break;
36028 case V4DFmode:
36029 cmode = V2DFmode;
36030 break;
36031 case V4SImode:
36032 cmode = V2SImode;
36033 break;
36034 case V4SFmode:
36035 cmode = V2SFmode;
36036 break;
36037 case V2DImode:
36038 cmode = DImode;
36039 break;
36040 case V2SImode:
36041 cmode = SImode;
36042 break;
36043 case V2DFmode:
36044 cmode = DFmode;
36045 break;
36046 case V2SFmode:
36047 cmode = SFmode;
36048 break;
36049 default:
36050 gcc_unreachable ();
36053 if (!register_operand (ops[1], cmode))
36054 ops[1] = force_reg (cmode, ops[1]);
36055 if (!register_operand (ops[0], cmode))
36056 ops[0] = force_reg (cmode, ops[0]);
36057 emit_insn (gen_rtx_SET (VOIDmode, target,
36058 gen_rtx_VEC_CONCAT (mode, ops[0],
36059 ops[1])));
36060 break;
36062 case 4:
36063 switch (mode)
36065 case V4DImode:
36066 cmode = V2DImode;
36067 break;
36068 case V4DFmode:
36069 cmode = V2DFmode;
36070 break;
36071 case V4SImode:
36072 cmode = V2SImode;
36073 break;
36074 case V4SFmode:
36075 cmode = V2SFmode;
36076 break;
36077 default:
36078 gcc_unreachable ();
36080 goto half;
36082 case 8:
36083 switch (mode)
36085 case V8SImode:
36086 cmode = V2SImode;
36087 hmode = V4SImode;
36088 break;
36089 case V8SFmode:
36090 cmode = V2SFmode;
36091 hmode = V4SFmode;
36092 break;
36093 default:
36094 gcc_unreachable ();
36096 goto half;
36098 half:
36099 /* FIXME: We process inputs backward to help RA. PR 36222. */
36100 i = n - 1;
36101 j = (n >> 1) - 1;
36102 for (; i > 0; i -= 2, j--)
36104 first[j] = gen_reg_rtx (cmode);
36105 v = gen_rtvec (2, ops[i - 1], ops[i]);
36106 ix86_expand_vector_init (false, first[j],
36107 gen_rtx_PARALLEL (cmode, v));
36110 n >>= 1;
36111 if (n > 2)
36113 gcc_assert (hmode != VOIDmode);
36114 for (i = j = 0; i < n; i += 2, j++)
36116 second[j] = gen_reg_rtx (hmode);
36117 ix86_expand_vector_init_concat (hmode, second [j],
36118 &first [i], 2);
36120 n >>= 1;
36121 ix86_expand_vector_init_concat (mode, target, second, n);
36123 else
36124 ix86_expand_vector_init_concat (mode, target, first, n);
36125 break;
36127 default:
36128 gcc_unreachable ();
36132 /* A subroutine of ix86_expand_vector_init_general. Use vector
36133 interleave to handle the most general case: all values variable,
36134 and none identical. */
36136 static void
36137 ix86_expand_vector_init_interleave (enum machine_mode mode,
36138 rtx target, rtx *ops, int n)
36140 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36141 int i, j;
36142 rtx op0, op1;
36143 rtx (*gen_load_even) (rtx, rtx, rtx);
36144 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36145 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36147 switch (mode)
36149 case V8HImode:
36150 gen_load_even = gen_vec_setv8hi;
36151 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36152 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36153 inner_mode = HImode;
36154 first_imode = V4SImode;
36155 second_imode = V2DImode;
36156 third_imode = VOIDmode;
36157 break;
36158 case V16QImode:
36159 gen_load_even = gen_vec_setv16qi;
36160 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36161 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36162 inner_mode = QImode;
36163 first_imode = V8HImode;
36164 second_imode = V4SImode;
36165 third_imode = V2DImode;
36166 break;
36167 default:
36168 gcc_unreachable ();
36171 for (i = 0; i < n; i++)
 36173 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36174 op0 = gen_reg_rtx (SImode);
36175 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36177 /* Insert the SImode value as low element of V4SImode vector. */
36178 op1 = gen_reg_rtx (V4SImode);
36179 op0 = gen_rtx_VEC_MERGE (V4SImode,
36180 gen_rtx_VEC_DUPLICATE (V4SImode,
36181 op0),
36182 CONST0_RTX (V4SImode),
36183 const1_rtx);
36184 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
 36186 /* Cast the V4SImode vector back to a vector in the original mode. */
36187 op0 = gen_reg_rtx (mode);
36188 emit_move_insn (op0, gen_lowpart (mode, op1));
 36190 /* Load even elements into the second position. */
36191 emit_insn (gen_load_even (op0,
36192 force_reg (inner_mode,
36193 ops [i + i + 1]),
36194 const1_rtx));
36196 /* Cast vector to FIRST_IMODE vector. */
36197 ops[i] = gen_reg_rtx (first_imode);
36198 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36201 /* Interleave low FIRST_IMODE vectors. */
36202 for (i = j = 0; i < n; i += 2, j++)
36204 op0 = gen_reg_rtx (first_imode);
36205 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36207 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36208 ops[j] = gen_reg_rtx (second_imode);
36209 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36212 /* Interleave low SECOND_IMODE vectors. */
36213 switch (second_imode)
36215 case V4SImode:
36216 for (i = j = 0; i < n / 2; i += 2, j++)
36218 op0 = gen_reg_rtx (second_imode);
36219 emit_insn (gen_interleave_second_low (op0, ops[i],
36220 ops[i + 1]));
36222 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36223 vector. */
36224 ops[j] = gen_reg_rtx (third_imode);
36225 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36227 second_imode = V2DImode;
36228 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36229 /* FALLTHRU */
36231 case V2DImode:
36232 op0 = gen_reg_rtx (second_imode);
36233 emit_insn (gen_interleave_second_low (op0, ops[0],
36234 ops[1]));
 36236 /* Cast the SECOND_IMODE vector back to a vector in the original
 36237 mode. */
36238 emit_insn (gen_rtx_SET (VOIDmode, target,
36239 gen_lowpart (mode, op0)));
36240 break;
36242 default:
36243 gcc_unreachable ();
36247 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36248 all values variable, and none identical. */
36250 static void
36251 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36252 rtx target, rtx vals)
36254 rtx ops[32], op0, op1;
36255 enum machine_mode half_mode = VOIDmode;
36256 int n, i;
36258 switch (mode)
36260 case V2SFmode:
36261 case V2SImode:
36262 if (!mmx_ok && !TARGET_SSE)
36263 break;
36264 /* FALLTHRU */
36266 case V8SFmode:
36267 case V8SImode:
36268 case V4DFmode:
36269 case V4DImode:
36270 case V4SFmode:
36271 case V4SImode:
36272 case V2DFmode:
36273 case V2DImode:
36274 n = GET_MODE_NUNITS (mode);
36275 for (i = 0; i < n; i++)
36276 ops[i] = XVECEXP (vals, 0, i);
36277 ix86_expand_vector_init_concat (mode, target, ops, n);
36278 return;
36280 case V32QImode:
36281 half_mode = V16QImode;
36282 goto half;
36284 case V16HImode:
36285 half_mode = V8HImode;
36286 goto half;
36288 half:
36289 n = GET_MODE_NUNITS (mode);
36290 for (i = 0; i < n; i++)
36291 ops[i] = XVECEXP (vals, 0, i);
36292 op0 = gen_reg_rtx (half_mode);
36293 op1 = gen_reg_rtx (half_mode);
36294 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36295 n >> 2);
36296 ix86_expand_vector_init_interleave (half_mode, op1,
36297 &ops [n >> 1], n >> 2);
36298 emit_insn (gen_rtx_SET (VOIDmode, target,
36299 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36300 return;
36302 case V16QImode:
36303 if (!TARGET_SSE4_1)
36304 break;
36305 /* FALLTHRU */
36307 case V8HImode:
36308 if (!TARGET_SSE2)
36309 break;
36311 /* Don't use ix86_expand_vector_init_interleave if we can't
36312 move from GPR to SSE register directly. */
36313 if (!TARGET_INTER_UNIT_MOVES)
36314 break;
36316 n = GET_MODE_NUNITS (mode);
36317 for (i = 0; i < n; i++)
36318 ops[i] = XVECEXP (vals, 0, i);
36319 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36320 return;
36322 case V4HImode:
36323 case V8QImode:
36324 break;
36326 default:
36327 gcc_unreachable ();
36331 int i, j, n_elts, n_words, n_elt_per_word;
36332 enum machine_mode inner_mode;
36333 rtx words[4], shift;
36335 inner_mode = GET_MODE_INNER (mode);
36336 n_elts = GET_MODE_NUNITS (mode);
36337 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36338 n_elt_per_word = n_elts / n_words;
36339 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
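    /* Build each word by inserting elements from the highest index down,
       so the lowest-indexed element of the group ends up in the least
       significant bits, matching the little-endian element layout.  */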
36341 for (i = 0; i < n_words; ++i)
36343 rtx word = NULL_RTX;
36345 for (j = 0; j < n_elt_per_word; ++j)
36347 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36348 elt = convert_modes (word_mode, inner_mode, elt, true);
36350 if (j == 0)
36351 word = elt;
36352 else
36354 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36355 word, 1, OPTAB_LIB_WIDEN);
36356 word = expand_simple_binop (word_mode, IOR, word, elt,
36357 word, 1, OPTAB_LIB_WIDEN);
36361 words[i] = word;
36364 if (n_words == 1)
36365 emit_move_insn (target, gen_lowpart (mode, words[0]));
36366 else if (n_words == 2)
36368 rtx tmp = gen_reg_rtx (mode);
36369 emit_clobber (tmp);
36370 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36371 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36372 emit_move_insn (target, tmp);
36374 else if (n_words == 4)
36376 rtx tmp = gen_reg_rtx (V4SImode);
36377 gcc_assert (word_mode == SImode);
36378 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36379 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36380 emit_move_insn (target, gen_lowpart (mode, tmp));
36382 else
36383 gcc_unreachable ();
36387 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36388 instructions unless MMX_OK is true. */
36390 void
36391 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36393 enum machine_mode mode = GET_MODE (target);
36394 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36395 int n_elts = GET_MODE_NUNITS (mode);
36396 int n_var = 0, one_var = -1;
36397 bool all_same = true, all_const_zero = true;
36398 int i;
36399 rtx x;
36401 for (i = 0; i < n_elts; ++i)
36403 x = XVECEXP (vals, 0, i);
36404 if (!(CONST_INT_P (x)
36405 || GET_CODE (x) == CONST_DOUBLE
36406 || GET_CODE (x) == CONST_FIXED))
36407 n_var++, one_var = i;
36408 else if (x != CONST0_RTX (inner_mode))
36409 all_const_zero = false;
36410 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36411 all_same = false;
36414 /* Constants are best loaded from the constant pool. */
36415 if (n_var == 0)
36417 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36418 return;
36421 /* If all values are identical, broadcast the value. */
36422 if (all_same
36423 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36424 XVECEXP (vals, 0, 0)))
36425 return;
36427 /* Values where only one field is non-constant are best loaded from
36428 the pool and overwritten via move later. */
36429 if (n_var == 1)
36431 if (all_const_zero
36432 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36433 XVECEXP (vals, 0, one_var),
36434 one_var))
36435 return;
36437 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36438 return;
36441 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36444 void
36445 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36447 enum machine_mode mode = GET_MODE (target);
36448 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36449 enum machine_mode half_mode;
36450 bool use_vec_merge = false;
36451 rtx tmp;
36452 static rtx (*gen_extract[6][2]) (rtx, rtx)
36454 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36455 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36456 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36457 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36458 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36459 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36461 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36463 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36464 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36465 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36466 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36467 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36468 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36470 int i, j, n;
36472 switch (mode)
36474 case V2SFmode:
36475 case V2SImode:
36476 if (mmx_ok)
36478 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36479 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36480 if (elt == 0)
36481 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36482 else
36483 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36484 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36485 return;
36487 break;
36489 case V2DImode:
36490 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36491 if (use_vec_merge)
36492 break;
36494 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36495 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36496 if (elt == 0)
36497 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36498 else
36499 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36500 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36501 return;
36503 case V2DFmode:
36505 rtx op0, op1;
36507 /* For the two element vectors, we implement a VEC_CONCAT with
36508 the extraction of the other element. */
36510 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36511 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36513 if (elt == 0)
36514 op0 = val, op1 = tmp;
36515 else
36516 op0 = tmp, op1 = val;
36518 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36519 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36521 return;
36523 case V4SFmode:
36524 use_vec_merge = TARGET_SSE4_1;
36525 if (use_vec_merge)
36526 break;
36528 switch (elt)
36530 case 0:
36531 use_vec_merge = true;
36532 break;
36534 case 1:
36535 /* tmp = target = A B C D */
36536 tmp = copy_to_reg (target);
36537 /* target = A A B B */
36538 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36539 /* target = X A B B */
36540 ix86_expand_vector_set (false, target, val, 0);
36541 /* target = A X C D */
36542 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36543 const1_rtx, const0_rtx,
36544 GEN_INT (2+4), GEN_INT (3+4)));
36545 return;
36547 case 2:
36548 /* tmp = target = A B C D */
36549 tmp = copy_to_reg (target);
36550 /* tmp = X B C D */
36551 ix86_expand_vector_set (false, tmp, val, 0);
36552 /* target = A B X D */
36553 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36554 const0_rtx, const1_rtx,
36555 GEN_INT (0+4), GEN_INT (3+4)));
36556 return;
36558 case 3:
36559 /* tmp = target = A B C D */
36560 tmp = copy_to_reg (target);
36561 /* tmp = X B C D */
36562 ix86_expand_vector_set (false, tmp, val, 0);
 36563 /* target = A B C X */
36564 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36565 const0_rtx, const1_rtx,
36566 GEN_INT (2+4), GEN_INT (0+4)));
36567 return;
36569 default:
36570 gcc_unreachable ();
36572 break;
36574 case V4SImode:
36575 use_vec_merge = TARGET_SSE4_1;
36576 if (use_vec_merge)
36577 break;
36579 /* Element 0 handled by vec_merge below. */
36580 if (elt == 0)
36582 use_vec_merge = true;
36583 break;
36586 if (TARGET_SSE2)
36588 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36589 store into element 0, then shuffle them back. */
36591 rtx order[4];
36593 order[0] = GEN_INT (elt);
36594 order[1] = const1_rtx;
36595 order[2] = const2_rtx;
36596 order[3] = GEN_INT (3);
36597 order[elt] = const0_rtx;
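	  /* ORDER now describes a permutation that swaps element 0 with
	     element ELT and leaves the rest in place; applying it twice
	     restores the original order.  */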
36599 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36600 order[1], order[2], order[3]));
36602 ix86_expand_vector_set (false, target, val, 0);
36604 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36605 order[1], order[2], order[3]));
36607 else
36609 /* For SSE1, we have to reuse the V4SF code. */
36610 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36611 gen_lowpart (SFmode, val), elt);
36613 return;
36615 case V8HImode:
36616 use_vec_merge = TARGET_SSE2;
36617 break;
36618 case V4HImode:
36619 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36620 break;
36622 case V16QImode:
36623 use_vec_merge = TARGET_SSE4_1;
36624 break;
36626 case V8QImode:
36627 break;
36629 case V32QImode:
36630 half_mode = V16QImode;
36631 j = 0;
36632 n = 16;
36633 goto half;
36635 case V16HImode:
36636 half_mode = V8HImode;
36637 j = 1;
36638 n = 8;
36639 goto half;
36641 case V8SImode:
36642 half_mode = V4SImode;
36643 j = 2;
36644 n = 4;
36645 goto half;
36647 case V4DImode:
36648 half_mode = V2DImode;
36649 j = 3;
36650 n = 2;
36651 goto half;
36653 case V8SFmode:
36654 half_mode = V4SFmode;
36655 j = 4;
36656 n = 4;
36657 goto half;
36659 case V4DFmode:
36660 half_mode = V2DFmode;
36661 j = 5;
36662 n = 2;
36663 goto half;
36665 half:
36666 /* Compute offset. */
36667 i = elt / n;
36668 elt %= n;
36670 gcc_assert (i <= 1);
36672 /* Extract the half. */
36673 tmp = gen_reg_rtx (half_mode);
36674 emit_insn (gen_extract[j][i] (tmp, target));
36676 /* Put val in tmp at elt. */
36677 ix86_expand_vector_set (false, tmp, val, elt);
36679 /* Put it back. */
36680 emit_insn (gen_insert[j][i] (target, target, tmp));
36681 return;
36683 default:
36684 break;
36687 if (use_vec_merge)
36689 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36690 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36691 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36693 else
36695 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36697 emit_move_insn (mem, target);
36699 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36700 emit_move_insn (tmp, val);
36702 emit_move_insn (target, mem);
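 /* As a rough C sketch (not the actual implementation), the stack-slot
    fallback above behaves like the following for a four-element float
    vector, assuming a GCC vector type v4sf:

      union { v4sf v; float f[4]; } u;
      u.v = *target;     /* spill the whole vector to memory */
      u.f[elt] = val;    /* overwrite the selected element */
      *target = u.v;     /* reload the modified vector */

    The vec_merge path above it keeps everything in registers instead.  */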
36706 void
36707 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36709 enum machine_mode mode = GET_MODE (vec);
36710 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36711 bool use_vec_extr = false;
36712 rtx tmp;
36714 switch (mode)
36716 case V2SImode:
36717 case V2SFmode:
36718 if (!mmx_ok)
36719 break;
36720 /* FALLTHRU */
36722 case V2DFmode:
36723 case V2DImode:
36724 use_vec_extr = true;
36725 break;
36727 case V4SFmode:
36728 use_vec_extr = TARGET_SSE4_1;
36729 if (use_vec_extr)
36730 break;
36732 switch (elt)
36734 case 0:
36735 tmp = vec;
36736 break;
36738 case 1:
36739 case 3:
36740 tmp = gen_reg_rtx (mode);
36741 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36742 GEN_INT (elt), GEN_INT (elt),
36743 GEN_INT (elt+4), GEN_INT (elt+4)));
36744 break;
36746 case 2:
36747 tmp = gen_reg_rtx (mode);
36748 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36749 break;
36751 default:
36752 gcc_unreachable ();
36754 vec = tmp;
36755 use_vec_extr = true;
36756 elt = 0;
36757 break;
36759 case V4SImode:
36760 use_vec_extr = TARGET_SSE4_1;
36761 if (use_vec_extr)
36762 break;
36764 if (TARGET_SSE2)
36766 switch (elt)
36768 case 0:
36769 tmp = vec;
36770 break;
36772 case 1:
36773 case 3:
36774 tmp = gen_reg_rtx (mode);
36775 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36776 GEN_INT (elt), GEN_INT (elt),
36777 GEN_INT (elt), GEN_INT (elt)));
36778 break;
36780 case 2:
36781 tmp = gen_reg_rtx (mode);
36782 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36783 break;
36785 default:
36786 gcc_unreachable ();
36788 vec = tmp;
36789 use_vec_extr = true;
36790 elt = 0;
36792 else
36794 /* For SSE1, we have to reuse the V4SF code. */
36795 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36796 gen_lowpart (V4SFmode, vec), elt);
36797 return;
36799 break;
36801 case V8HImode:
36802 use_vec_extr = TARGET_SSE2;
36803 break;
36804 case V4HImode:
36805 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36806 break;
36808 case V16QImode:
36809 use_vec_extr = TARGET_SSE4_1;
36810 break;
36812 case V8SFmode:
36813 if (TARGET_AVX)
36815 tmp = gen_reg_rtx (V4SFmode);
36816 if (elt < 4)
36817 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36818 else
36819 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36820 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36821 return;
36823 break;
36825 case V4DFmode:
36826 if (TARGET_AVX)
36828 tmp = gen_reg_rtx (V2DFmode);
36829 if (elt < 2)
36830 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36831 else
36832 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36833 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36834 return;
36836 break;
36838 case V32QImode:
36839 if (TARGET_AVX)
36841 tmp = gen_reg_rtx (V16QImode);
36842 if (elt < 16)
36843 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36844 else
36845 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36846 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36847 return;
36849 break;
36851 case V16HImode:
36852 if (TARGET_AVX)
36854 tmp = gen_reg_rtx (V8HImode);
36855 if (elt < 8)
36856 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36857 else
36858 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36859 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36860 return;
36862 break;
36864 case V8SImode:
36865 if (TARGET_AVX)
36867 tmp = gen_reg_rtx (V4SImode);
36868 if (elt < 4)
36869 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36870 else
36871 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36872 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36873 return;
36875 break;
36877 case V4DImode:
36878 if (TARGET_AVX)
36880 tmp = gen_reg_rtx (V2DImode);
36881 if (elt < 2)
36882 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36883 else
36884 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36885 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36886 return;
36888 break;
36890 case V8QImode:
36891 /* ??? Could extract the appropriate HImode element and shift. */
36892 default:
36893 break;
36896 if (use_vec_extr)
36898 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36899 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36901 /* Let the rtl optimizers know about the zero extension performed. */
36902 if (inner_mode == QImode || inner_mode == HImode)
36904 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36905 target = gen_lowpart (SImode, target);
36908 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36910 else
36912 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36914 emit_move_insn (mem, vec);
36916 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36917 emit_move_insn (target, tmp);
36921 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36922 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36923 The upper bits of DEST are undefined, though they shouldn't cause
36924 exceptions (some bits from src or all zeros are ok). */
36926 static void
36927 emit_reduc_half (rtx dest, rtx src, int i)
36929 rtx tem;
36930 switch (GET_MODE (src))
36932 case V4SFmode:
36933 if (i == 128)
36934 tem = gen_sse_movhlps (dest, src, src);
36935 else
36936 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36937 GEN_INT (1 + 4), GEN_INT (1 + 4));
36938 break;
36939 case V2DFmode:
36940 tem = gen_vec_interleave_highv2df (dest, src, src);
36941 break;
36942 case V16QImode:
36943 case V8HImode:
36944 case V4SImode:
36945 case V2DImode:
36946 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36947 gen_lowpart (V1TImode, src),
36948 GEN_INT (i / 2));
36949 break;
36950 case V8SFmode:
36951 if (i == 256)
36952 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36953 else
36954 tem = gen_avx_shufps256 (dest, src, src,
36955 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36956 break;
36957 case V4DFmode:
36958 if (i == 256)
36959 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36960 else
36961 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36962 break;
36963 case V32QImode:
36964 case V16HImode:
36965 case V8SImode:
36966 case V4DImode:
36967 if (i == 256)
36968 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36969 gen_lowpart (V4DImode, src),
36970 gen_lowpart (V4DImode, src),
36971 const1_rtx);
36972 else
36973 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36974 gen_lowpart (V2TImode, src),
36975 GEN_INT (i / 2));
36976 break;
36977 default:
36978 gcc_unreachable ();
36980 emit_insn (tem);
36983 /* Expand a vector reduction. FN is the binary pattern to reduce;
36984 DEST is the destination; IN is the input vector. */
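 /* As a sketch, for V4SFmode with FN being addition the halving loop
    below performs:

      i == 128:  half = { v2, v3, .., .. }    vec  = { v0+v2, v1+v3, .., .. }
      i ==  64:  half = { v1+v3, .., .., .. }  dest = { v0+v1+v2+v3, .., .., .. }

    so after log2(nelt) steps element 0 of DEST holds the full reduction;
    the remaining elements are unspecified.  */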
36986 void
36987 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36989 rtx half, dst, vec = in;
36990 enum machine_mode mode = GET_MODE (in);
36991 int i;
36993 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
36994 if (TARGET_SSE4_1
36995 && mode == V8HImode
36996 && fn == gen_uminv8hi3)
36998 emit_insn (gen_sse4_1_phminposuw (dest, in));
36999 return;
37002 for (i = GET_MODE_BITSIZE (mode);
37003 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37004 i >>= 1)
37006 half = gen_reg_rtx (mode);
37007 emit_reduc_half (half, vec, i);
37008 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37009 dst = dest;
37010 else
37011 dst = gen_reg_rtx (mode);
37012 emit_insn (fn (dst, half, vec));
37013 vec = dst;
37017 /* Target hook for scalar_mode_supported_p. */
37018 static bool
37019 ix86_scalar_mode_supported_p (enum machine_mode mode)
37021 if (DECIMAL_FLOAT_MODE_P (mode))
37022 return default_decimal_float_supported_p ();
37023 else if (mode == TFmode)
37024 return true;
37025 else
37026 return default_scalar_mode_supported_p (mode);
37029 /* Implements target hook vector_mode_supported_p. */
37030 static bool
37031 ix86_vector_mode_supported_p (enum machine_mode mode)
37033 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37034 return true;
37035 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37036 return true;
37037 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37038 return true;
37039 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37040 return true;
37041 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37042 return true;
37043 return false;
37046 /* Target hook for c_mode_for_suffix. */
37047 static enum machine_mode
37048 ix86_c_mode_for_suffix (char suffix)
37050 if (suffix == 'q')
37051 return TFmode;
37052 if (suffix == 'w')
37053 return XFmode;
37055 return VOIDmode;
37058 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37060 We do this in the new i386 backend to maintain source compatibility
37061 with the old cc0-based compiler. */
37063 static tree
37064 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37065 tree inputs ATTRIBUTE_UNUSED,
37066 tree clobbers)
37068 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37069 clobbers);
37070 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37071 clobbers);
37072 return clobbers;
37075 /* Implements the target hook targetm.asm.encode_section_info. */
37077 static void ATTRIBUTE_UNUSED
37078 ix86_encode_section_info (tree decl, rtx rtl, int first)
37080 default_encode_section_info (decl, rtl, first);
37082 if (TREE_CODE (decl) == VAR_DECL
37083 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37084 && ix86_in_large_data_p (decl))
37085 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37088 /* Worker function for REVERSE_CONDITION. */
37090 enum rtx_code
37091 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37093 return (mode != CCFPmode && mode != CCFPUmode
37094 ? reverse_condition (code)
37095 : reverse_condition_maybe_unordered (code));
37098 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37099 to OPERANDS[0]. */
37101 const char *
37102 output_387_reg_move (rtx insn, rtx *operands)
37104 if (REG_P (operands[0]))
37106 if (REG_P (operands[1])
37107 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37109 if (REGNO (operands[0]) == FIRST_STACK_REG)
37110 return output_387_ffreep (operands, 0);
37111 return "fstp\t%y0";
37113 if (STACK_TOP_P (operands[0]))
37114 return "fld%Z1\t%y1";
37115 return "fst\t%y0";
37117 else if (MEM_P (operands[0]))
37119 gcc_assert (REG_P (operands[1]));
37120 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37121 return "fstp%Z0\t%y0";
37122 else
37124 /* There is no non-popping store to memory for XFmode.
37125 So if we need one, follow the store with a load. */
37126 if (GET_MODE (operands[0]) == XFmode)
37127 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37128 else
37129 return "fst%Z0\t%y0";
37132 else
37133 gcc_unreachable();
37136 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37137 FP status register is set. */
37139 void
37140 ix86_emit_fp_unordered_jump (rtx label)
37142 rtx reg = gen_reg_rtx (HImode);
37143 rtx temp;
37145 emit_insn (gen_x86_fnstsw_1 (reg));
37147 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37149 emit_insn (gen_x86_sahf_1 (reg));
37151 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37152 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37154 else
37156 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37158 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37159 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37162 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37163 gen_rtx_LABEL_REF (VOIDmode, label),
37164 pc_rtx);
37165 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37167 emit_jump_insn (temp);
37168 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37171 /* Output code to perform a log1p XFmode calculation. */
37173 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37175 rtx label1 = gen_label_rtx ();
37176 rtx label2 = gen_label_rtx ();
37178 rtx tmp = gen_reg_rtx (XFmode);
37179 rtx tmp2 = gen_reg_rtx (XFmode);
37180 rtx test;
37182 emit_insn (gen_absxf2 (tmp, op1));
37183 test = gen_rtx_GE (VOIDmode, tmp,
37184 CONST_DOUBLE_FROM_REAL_VALUE (
37185 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37186 XFmode));
37187 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37189 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37190 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37191 emit_jump (label2);
37193 emit_label (label1);
37194 emit_move_insn (tmp, CONST1_RTX (XFmode));
37195 emit_insn (gen_addxf3 (tmp, op1, tmp));
37196 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37197 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37199 emit_label (label2);
37202 /* Emit code for round calculation. */
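 /* In C terms the sequence emitted below computes, roughly,

      double round (double a)
      {
        double r = floor (fabs (a) + 0.5);
        return signbit (a) ? -r : r;
      }

    where the sign is taken from the C1 bit of the fxam result and the
    floor is done by frndint or one of the lfloor patterns, depending
    on OUTMODE.  */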
37203 void ix86_emit_i387_round (rtx op0, rtx op1)
37205 enum machine_mode inmode = GET_MODE (op1);
37206 enum machine_mode outmode = GET_MODE (op0);
37207 rtx e1, e2, res, tmp, tmp1, half;
37208 rtx scratch = gen_reg_rtx (HImode);
37209 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37210 rtx jump_label = gen_label_rtx ();
37211 rtx insn;
37212 rtx (*gen_abs) (rtx, rtx);
37213 rtx (*gen_neg) (rtx, rtx);
37215 switch (inmode)
37217 case SFmode:
37218 gen_abs = gen_abssf2;
37219 break;
37220 case DFmode:
37221 gen_abs = gen_absdf2;
37222 break;
37223 case XFmode:
37224 gen_abs = gen_absxf2;
37225 break;
37226 default:
37227 gcc_unreachable ();
37230 switch (outmode)
37232 case SFmode:
37233 gen_neg = gen_negsf2;
37234 break;
37235 case DFmode:
37236 gen_neg = gen_negdf2;
37237 break;
37238 case XFmode:
37239 gen_neg = gen_negxf2;
37240 break;
37241 case HImode:
37242 gen_neg = gen_neghi2;
37243 break;
37244 case SImode:
37245 gen_neg = gen_negsi2;
37246 break;
37247 case DImode:
37248 gen_neg = gen_negdi2;
37249 break;
37250 default:
37251 gcc_unreachable ();
37254 e1 = gen_reg_rtx (inmode);
37255 e2 = gen_reg_rtx (inmode);
37256 res = gen_reg_rtx (outmode);
37258 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37260 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37262 /* scratch = fxam(op1) */
37263 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37264 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37265 UNSPEC_FXAM)));
37266 /* e1 = fabs(op1) */
37267 emit_insn (gen_abs (e1, op1));
37269 /* e2 = e1 + 0.5 */
37270 half = force_reg (inmode, half);
37271 emit_insn (gen_rtx_SET (VOIDmode, e2,
37272 gen_rtx_PLUS (inmode, e1, half)));
37274 /* res = floor(e2) */
37275 if (inmode != XFmode)
37277 tmp1 = gen_reg_rtx (XFmode);
37279 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37280 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37282 else
37283 tmp1 = e2;
37285 switch (outmode)
37287 case SFmode:
37288 case DFmode:
37290 rtx tmp0 = gen_reg_rtx (XFmode);
37292 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37294 emit_insn (gen_rtx_SET (VOIDmode, res,
37295 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37296 UNSPEC_TRUNC_NOOP)));
37298 break;
37299 case XFmode:
37300 emit_insn (gen_frndintxf2_floor (res, tmp1));
37301 break;
37302 case HImode:
37303 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37304 break;
37305 case SImode:
37306 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37307 break;
37308 case DImode:
37309 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37310 break;
37311 default:
37312 gcc_unreachable ();
37315 /* flags = signbit(a) */
37316 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37318 /* if (flags) then res = -res */
37319 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37320 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37321 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37322 pc_rtx);
37323 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37324 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37325 JUMP_LABEL (insn) = jump_label;
37327 emit_insn (gen_neg (res, res));
37329 emit_label (jump_label);
37330 LABEL_NUSES (jump_label) = 1;
37332 emit_move_insn (op0, res);
37335 /* Output code to perform a Newton-Raphson approximation of a single precision
37336 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
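 /* The expansion is a single Newton-Raphson step on the RCPSS estimate:
    with x0 ~ 1/b the refined value is x1 = x0 * (2 - b*x0), rewritten
    below as (x0 + x0) - (b * x0) * x0.  A C sketch, with rcp_estimate ()
    standing in for the hardware ~12-bit estimate (hypothetical helper):

      float swdiv (float a, float b)
      {
        float x0 = rcp_estimate (b);
        float e0 = (x0 * b) * x0;
        float x1 = (x0 + x0) - e0;
        return a * x1;               /* roughly single-precision accurate */
      }
 */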
37338 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37340 rtx x0, x1, e0, e1;
37342 x0 = gen_reg_rtx (mode);
37343 e0 = gen_reg_rtx (mode);
37344 e1 = gen_reg_rtx (mode);
37345 x1 = gen_reg_rtx (mode);
37347 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37349 b = force_reg (mode, b);
37351 /* x0 = rcp(b) estimate */
37352 emit_insn (gen_rtx_SET (VOIDmode, x0,
37353 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37354 UNSPEC_RCP)));
37355 /* e0 = x0 * b */
37356 emit_insn (gen_rtx_SET (VOIDmode, e0,
37357 gen_rtx_MULT (mode, x0, b)));
37359 /* e0 = x0 * e0 */
37360 emit_insn (gen_rtx_SET (VOIDmode, e0,
37361 gen_rtx_MULT (mode, x0, e0)));
37363 /* e1 = x0 + x0 */
37364 emit_insn (gen_rtx_SET (VOIDmode, e1,
37365 gen_rtx_PLUS (mode, x0, x0)));
37367 /* x1 = e1 - e0 */
37368 emit_insn (gen_rtx_SET (VOIDmode, x1,
37369 gen_rtx_MINUS (mode, e1, e0)));
37371 /* res = a * x1 */
37372 emit_insn (gen_rtx_SET (VOIDmode, res,
37373 gen_rtx_MULT (mode, a, x1)));
37376 /* Output code to perform a Newton-Raphson approximation of a
37377 single precision floating point [reciprocal] square root. */
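 /* This is one Newton-Raphson step on the RSQRTSS estimate x0 ~ 1/sqrt(a):
    rsqrt(a) ~ 0.5 * x0 * (3 - a*x0*x0), written below with the factors
    negated, and multiplying the result by a yields sqrt(a) directly.
    A C sketch, with rsqrt_estimate () standing in for the hardware
    estimate (hypothetical helper):

      float swrsqrt (float a)
      {
        float x0 = rsqrt_estimate (a);
        float e1 = (x0 * a) * x0;            /* a * x0 * x0 */
        return (-0.5f * x0) * (e1 - 3.0f);   /* refined 1/sqrt(a) */
      }

    The !recip path also masks x0 with (a != 0.0) so that sqrt (0.0)
    produces 0.0 rather than 0 * inf = NaN.  */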
37379 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37380 bool recip)
37382 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37383 REAL_VALUE_TYPE r;
37385 x0 = gen_reg_rtx (mode);
37386 e0 = gen_reg_rtx (mode);
37387 e1 = gen_reg_rtx (mode);
37388 e2 = gen_reg_rtx (mode);
37389 e3 = gen_reg_rtx (mode);
37391 real_from_integer (&r, VOIDmode, -3, -1, 0);
37392 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37394 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37395 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37397 if (VECTOR_MODE_P (mode))
37399 mthree = ix86_build_const_vector (mode, true, mthree);
37400 mhalf = ix86_build_const_vector (mode, true, mhalf);
37403 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37404 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37406 a = force_reg (mode, a);
37408 /* x0 = rsqrt(a) estimate */
37409 emit_insn (gen_rtx_SET (VOIDmode, x0,
37410 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37411 UNSPEC_RSQRT)));
37413 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
37414 if (!recip)
37416 rtx zero, mask;
37418 zero = gen_reg_rtx (mode);
37419 mask = gen_reg_rtx (mode);
37421 zero = force_reg (mode, CONST0_RTX(mode));
37422 emit_insn (gen_rtx_SET (VOIDmode, mask,
37423 gen_rtx_NE (mode, zero, a)));
37425 emit_insn (gen_rtx_SET (VOIDmode, x0,
37426 gen_rtx_AND (mode, x0, mask)));
37429 /* e0 = x0 * a */
37430 emit_insn (gen_rtx_SET (VOIDmode, e0,
37431 gen_rtx_MULT (mode, x0, a)));
37432 /* e1 = e0 * x0 */
37433 emit_insn (gen_rtx_SET (VOIDmode, e1,
37434 gen_rtx_MULT (mode, e0, x0)));
37436 /* e2 = e1 - 3. */
37437 mthree = force_reg (mode, mthree);
37438 emit_insn (gen_rtx_SET (VOIDmode, e2,
37439 gen_rtx_PLUS (mode, e1, mthree)));
37441 mhalf = force_reg (mode, mhalf);
37442 if (recip)
37443 /* e3 = -.5 * x0 */
37444 emit_insn (gen_rtx_SET (VOIDmode, e3,
37445 gen_rtx_MULT (mode, x0, mhalf)));
37446 else
37447 /* e3 = -.5 * e0 */
37448 emit_insn (gen_rtx_SET (VOIDmode, e3,
37449 gen_rtx_MULT (mode, e0, mhalf)));
37450 /* ret = e2 * e3 */
37451 emit_insn (gen_rtx_SET (VOIDmode, res,
37452 gen_rtx_MULT (mode, e2, e3)));
37455 #ifdef TARGET_SOLARIS
37456 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37458 static void
37459 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37460 tree decl)
37462 /* With Binutils 2.15, the "@unwind" marker must be specified on
37463 every occurrence of the ".eh_frame" section, not just the first
37464 one. */
37465 if (TARGET_64BIT
37466 && strcmp (name, ".eh_frame") == 0)
37468 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37469 flags & SECTION_WRITE ? "aw" : "a");
37470 return;
37473 #ifndef USE_GAS
37474 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37476 solaris_elf_asm_comdat_section (name, flags, decl);
37477 return;
37479 #endif
37481 default_elf_asm_named_section (name, flags, decl);
37483 #endif /* TARGET_SOLARIS */
37485 /* Return the mangling of TYPE if it is an extended fundamental type. */
37487 static const char *
37488 ix86_mangle_type (const_tree type)
37490 type = TYPE_MAIN_VARIANT (type);
37492 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37493 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37494 return NULL;
37496 switch (TYPE_MODE (type))
37498 case TFmode:
37499 /* __float128 is "g". */
37500 return "g";
37501 case XFmode:
37502 /* "long double" or __float80 is "e". */
37503 return "e";
37504 default:
37505 return NULL;
37509 /* For 32-bit code we can save PIC register setup by using
37510 the hidden __stack_chk_fail_local function instead of calling
37511 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
37512 register, so it is better to call __stack_chk_fail directly. */
37514 static tree ATTRIBUTE_UNUSED
37515 ix86_stack_protect_fail (void)
37517 return TARGET_64BIT
37518 ? default_external_stack_protect_fail ()
37519 : default_hidden_stack_protect_fail ();
37522 /* Select a format to encode pointers in exception handling data. CODE
37523 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37524 true if the symbol may be affected by dynamic relocations.
37526 ??? All x86 object file formats are capable of representing this.
37527 After all, the relocation needed is the same as for the call insn.
37528 Whether or not a particular assembler allows us to enter such, I
37529 guess we'll have to see. */
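 /* For example, 32-bit PIC code encodes a global symbol as
    DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4, i.e. the
    0x9b encoding byte commonly seen in .eh_frame augmentation data.  */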
37531 asm_preferred_eh_data_format (int code, int global)
37533 if (flag_pic)
37535 int type = DW_EH_PE_sdata8;
37536 if (!TARGET_64BIT
37537 || ix86_cmodel == CM_SMALL_PIC
37538 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37539 type = DW_EH_PE_sdata4;
37540 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37542 if (ix86_cmodel == CM_SMALL
37543 || (ix86_cmodel == CM_MEDIUM && code))
37544 return DW_EH_PE_udata4;
37545 return DW_EH_PE_absptr;
37548 /* Expand copysign from SIGN to the positive value ABS_VALUE,
37549 storing the result in RESULT. If MASK is non-null, it must be a mask
37550 that masks out the sign bit. */
37551 static void
37552 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37554 enum machine_mode mode = GET_MODE (sign);
37555 rtx sgn = gen_reg_rtx (mode);
37556 if (mask == NULL_RTX)
37558 enum machine_mode vmode;
37560 if (mode == SFmode)
37561 vmode = V4SFmode;
37562 else if (mode == DFmode)
37563 vmode = V2DFmode;
37564 else
37565 vmode = mode;
37567 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37568 if (!VECTOR_MODE_P (mode))
37570 /* We need to generate a scalar mode mask in this case. */
37571 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37572 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37573 mask = gen_reg_rtx (mode);
37574 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37577 else
37578 mask = gen_rtx_NOT (mode, mask);
37579 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37580 gen_rtx_AND (mode, mask, sign)));
37581 emit_insn (gen_rtx_SET (VOIDmode, result,
37582 gen_rtx_IOR (mode, abs_value, sgn)));
37585 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37586 mask for masking out the sign-bit is stored in *SMASK, if that is
37587 non-null. */
37588 static rtx
37589 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37591 enum machine_mode vmode, mode = GET_MODE (op0);
37592 rtx xa, mask;
37594 xa = gen_reg_rtx (mode);
37595 if (mode == SFmode)
37596 vmode = V4SFmode;
37597 else if (mode == DFmode)
37598 vmode = V2DFmode;
37599 else
37600 vmode = mode;
37601 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37602 if (!VECTOR_MODE_P (mode))
37604 /* We need to generate a scalar mode mask in this case. */
37605 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37606 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37607 mask = gen_reg_rtx (mode);
37608 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37610 emit_insn (gen_rtx_SET (VOIDmode, xa,
37611 gen_rtx_AND (mode, op0, mask)));
37613 if (smask)
37614 *smask = mask;
37616 return xa;
37619 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37620 swapping the operands if SWAP_OPERANDS is true. The expanded
37621 code is a forward jump to a newly created label in case the
37622 comparison is true. The generated label rtx is returned. */
37623 static rtx
37624 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37625 bool swap_operands)
37627 rtx label, tmp;
37629 if (swap_operands)
37631 tmp = op0;
37632 op0 = op1;
37633 op1 = tmp;
37636 label = gen_label_rtx ();
37637 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37638 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37639 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37640 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37641 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37642 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37643 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37644 JUMP_LABEL (tmp) = label;
37646 return label;
37649 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37650 using comparison code CODE. Operands are swapped for the comparison if
37651 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
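 /* The mask produced is all-ones or all-zeros in the low element; the
    rounding expanders below AND it with a constant such as 1.0 so that
    the compensation can be applied unconditionally.  */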
37652 static rtx
37653 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37654 bool swap_operands)
37656 rtx (*insn)(rtx, rtx, rtx, rtx);
37657 enum machine_mode mode = GET_MODE (op0);
37658 rtx mask = gen_reg_rtx (mode);
37660 if (swap_operands)
37662 rtx tmp = op0;
37663 op0 = op1;
37664 op1 = tmp;
37667 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37669 emit_insn (insn (mask, op0, op1,
37670 gen_rtx_fmt_ee (code, mode, op0, op1)));
37671 return mask;
37674 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37675 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
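 /* Adding and then subtracting this constant rounds any value of smaller
    magnitude to an integer in the current rounding mode, since the sum
    has no bits left for a fraction.  E.g. in the default round-to-nearest
    mode, 3.7 + 2**52 is representable only as 4.0 + 2**52, and
    subtracting 2**52 again leaves 4.0.  */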
37676 static rtx
37677 ix86_gen_TWO52 (enum machine_mode mode)
37679 REAL_VALUE_TYPE TWO52r;
37680 rtx TWO52;
37682 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37683 TWO52 = const_double_from_real_value (TWO52r, mode);
37684 TWO52 = force_reg (mode, TWO52);
37686 return TWO52;
37689 /* Expand SSE sequence for computing lround from OP1 storing
37690 into OP0. */
37691 void
37692 ix86_expand_lround (rtx op0, rtx op1)
37694 /* C code for the stuff we're doing below:
37695 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37696 return (long)tmp;
37698 enum machine_mode mode = GET_MODE (op1);
37699 const struct real_format *fmt;
37700 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37701 rtx adj;
37703 /* load nextafter (0.5, 0.0) */
37704 fmt = REAL_MODE_FORMAT (mode);
37705 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37706 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
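 /* Using nextafter (0.5, 0.0) instead of 0.5 keeps values just below 0.5
    from rounding up: with x the largest double below 0.5, x + 0.5 would
    round to 1.0, while x + pred_half yields a value below 1.0 that still
    truncates to 0.  */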
37708 /* adj = copysign (0.5, op1) */
37709 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37710 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37712 /* adj = op1 + adj */
37713 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37715 /* op0 = (imode)adj */
37716 expand_fix (op0, adj, 0);
37719 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1,
37720 storing into OPERAND0. */
37721 void
37722 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37724 /* C code for the stuff we're doing below (for do_floor):
37725 xi = (long)op1;
37726 xi -= (double)xi > op1 ? 1 : 0;
37727 return xi;
37729 enum machine_mode fmode = GET_MODE (op1);
37730 enum machine_mode imode = GET_MODE (op0);
37731 rtx ireg, freg, label, tmp;
37733 /* reg = (long)op1 */
37734 ireg = gen_reg_rtx (imode);
37735 expand_fix (ireg, op1, 0);
37737 /* freg = (double)reg */
37738 freg = gen_reg_rtx (fmode);
37739 expand_float (freg, ireg, 0);
37741 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37742 label = ix86_expand_sse_compare_and_jump (UNLE,
37743 freg, op1, !do_floor);
37744 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37745 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37746 emit_move_insn (ireg, tmp);
37748 emit_label (label);
37749 LABEL_NUSES (label) = 1;
37751 emit_move_insn (op0, ireg);
37754 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37755 result in OPERAND0. */
37756 void
37757 ix86_expand_rint (rtx operand0, rtx operand1)
37759 /* C code for the stuff we're doing below:
37760 xa = fabs (operand1);
37761 if (!isless (xa, 2**52))
37762 return operand1;
37763 xa = xa + 2**52 - 2**52;
37764 return copysign (xa, operand1);
37766 enum machine_mode mode = GET_MODE (operand0);
37767 rtx res, xa, label, TWO52, mask;
37769 res = gen_reg_rtx (mode);
37770 emit_move_insn (res, operand1);
37772 /* xa = abs (operand1) */
37773 xa = ix86_expand_sse_fabs (res, &mask);
37775 /* if (!isless (xa, TWO52)) goto label; */
37776 TWO52 = ix86_gen_TWO52 (mode);
37777 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37779 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37780 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37782 ix86_sse_copysign_to_positive (res, xa, res, mask);
37784 emit_label (label);
37785 LABEL_NUSES (label) = 1;
37787 emit_move_insn (operand0, res);
37790 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37791 into OPERAND0. */
37792 void
37793 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37795 /* C code for the stuff we expand below.
37796 double xa = fabs (x), x2;
37797 if (!isless (xa, TWO52))
37798 return x;
37799 xa = xa + TWO52 - TWO52;
37800 x2 = copysign (xa, x);
37801 Compensate. Floor:
37802 if (x2 > x)
37803 x2 -= 1;
37804 Compensate. Ceil:
37805 if (x2 < x)
37806 x2 -= -1;
37807 return x2;
37809 enum machine_mode mode = GET_MODE (operand0);
37810 rtx xa, TWO52, tmp, label, one, res, mask;
37812 TWO52 = ix86_gen_TWO52 (mode);
37814 /* Temporary for holding the result, initialized to the input
37815 operand to ease control flow. */
37816 res = gen_reg_rtx (mode);
37817 emit_move_insn (res, operand1);
37819 /* xa = abs (operand1) */
37820 xa = ix86_expand_sse_fabs (res, &mask);
37822 /* if (!isless (xa, TWO52)) goto label; */
37823 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37825 /* xa = xa + TWO52 - TWO52; */
37826 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37827 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37829 /* xa = copysign (xa, operand1) */
37830 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37832 /* generate 1.0 or -1.0 */
37833 one = force_reg (mode,
37834 const_double_from_real_value (do_floor
37835 ? dconst1 : dconstm1, mode));
37837 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37838 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37839 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37840 gen_rtx_AND (mode, one, tmp)));
37841 /* We always need to subtract here to preserve signed zero. */
37842 tmp = expand_simple_binop (mode, MINUS,
37843 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37844 emit_move_insn (res, tmp);
37846 emit_label (label);
37847 LABEL_NUSES (label) = 1;
37849 emit_move_insn (operand0, res);
37852 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37853 into OPERAND0. */
37854 void
37855 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37857 /* C code for the stuff we expand below.
37858 double xa = fabs (x), x2;
37859 if (!isless (xa, TWO52))
37860 return x;
37861 x2 = (double)(long)x;
37862 Compensate. Floor:
37863 if (x2 > x)
37864 x2 -= 1;
37865 Compensate. Ceil:
37866 if (x2 < x)
37867 x2 += 1;
37868 if (HONOR_SIGNED_ZEROS (mode))
37869 return copysign (x2, x);
37870 return x2;
37872 enum machine_mode mode = GET_MODE (operand0);
37873 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37875 TWO52 = ix86_gen_TWO52 (mode);
37877 /* Temporary for holding the result, initialized to the input
37878 operand to ease control flow. */
37879 res = gen_reg_rtx (mode);
37880 emit_move_insn (res, operand1);
37882 /* xa = abs (operand1) */
37883 xa = ix86_expand_sse_fabs (res, &mask);
37885 /* if (!isless (xa, TWO52)) goto label; */
37886 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37888 /* xa = (double)(long)x */
37889 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37890 expand_fix (xi, res, 0);
37891 expand_float (xa, xi, 0);
37893 /* generate 1.0 */
37894 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37896 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37897 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37898 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37899 gen_rtx_AND (mode, one, tmp)));
37900 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37901 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37902 emit_move_insn (res, tmp);
37904 if (HONOR_SIGNED_ZEROS (mode))
37905 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37907 emit_label (label);
37908 LABEL_NUSES (label) = 1;
37910 emit_move_insn (operand0, res);
37913 /* Expand SSE sequence for computing round from OPERAND1 storing
37914 into OPERAND0. The sequence works without relying on DImode truncation
37915 via cvttsd2siq, which is only available on 64-bit targets. */
37916 void
37917 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37919 /* C code for the stuff we expand below.
37920 double xa = fabs (x), xa2, x2;
37921 if (!isless (xa, TWO52))
37922 return x;
37923 Using the absolute value and copying back sign makes
37924 -0.0 -> -0.0 correct.
37925 xa2 = xa + TWO52 - TWO52;
37926 Compensate.
37927 dxa = xa2 - xa;
37928 if (dxa <= -0.5)
37929 xa2 += 1;
37930 else if (dxa > 0.5)
37931 xa2 -= 1;
37932 x2 = copysign (xa2, x);
37933 return x2;
37935 enum machine_mode mode = GET_MODE (operand0);
37936 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37938 TWO52 = ix86_gen_TWO52 (mode);
37940 /* Temporary for holding the result, initialized to the input
37941 operand to ease control flow. */
37942 res = gen_reg_rtx (mode);
37943 emit_move_insn (res, operand1);
37945 /* xa = abs (operand1) */
37946 xa = ix86_expand_sse_fabs (res, &mask);
37948 /* if (!isless (xa, TWO52)) goto label; */
37949 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37951 /* xa2 = xa + TWO52 - TWO52; */
37952 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37953 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37955 /* dxa = xa2 - xa; */
37956 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37958 /* generate 0.5, 1.0 and -0.5 */
37959 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37960 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37961 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37962 0, OPTAB_DIRECT);
37964 /* Compensate. */
37965 tmp = gen_reg_rtx (mode);
37966 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37967 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37968 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37969 gen_rtx_AND (mode, one, tmp)));
37970 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37971 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37972 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37973 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37974 gen_rtx_AND (mode, one, tmp)));
37975 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37977 /* res = copysign (xa2, operand1) */
37978 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37980 emit_label (label);
37981 LABEL_NUSES (label) = 1;
37983 emit_move_insn (operand0, res);
37986 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37987 into OPERAND0. */
37988 void
37989 ix86_expand_trunc (rtx operand0, rtx operand1)
37991 /* C code for SSE variant we expand below.
37992 double xa = fabs (x), x2;
37993 if (!isless (xa, TWO52))
37994 return x;
37995 x2 = (double)(long)x;
37996 if (HONOR_SIGNED_ZEROS (mode))
37997 return copysign (x2, x);
37998 return x2;
38000 enum machine_mode mode = GET_MODE (operand0);
38001 rtx xa, xi, TWO52, label, res, mask;
38003 TWO52 = ix86_gen_TWO52 (mode);
38005 /* Temporary for holding the result, initialized to the input
38006 operand to ease control flow. */
38007 res = gen_reg_rtx (mode);
38008 emit_move_insn (res, operand1);
38010 /* xa = abs (operand1) */
38011 xa = ix86_expand_sse_fabs (res, &mask);
38013 /* if (!isless (xa, TWO52)) goto label; */
38014 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38016 /* x = (double)(long)x */
38017 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38018 expand_fix (xi, res, 0);
38019 expand_float (res, xi, 0);
38021 if (HONOR_SIGNED_ZEROS (mode))
38022 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38024 emit_label (label);
38025 LABEL_NUSES (label) = 1;
38027 emit_move_insn (operand0, res);
38030 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38031 into OPERAND0. */
38032 void
38033 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38035 enum machine_mode mode = GET_MODE (operand0);
38036 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38038 /* C code for SSE variant we expand below.
38039 double xa = fabs (x), x2;
38040 if (!isless (xa, TWO52))
38041 return x;
38042 xa2 = xa + TWO52 - TWO52;
38043 Compensate:
38044 if (xa2 > xa)
38045 xa2 -= 1.0;
38046 x2 = copysign (xa2, x);
38047 return x2;
38050 TWO52 = ix86_gen_TWO52 (mode);
38052 /* Temporary for holding the result, initialized to the input
38053 operand to ease control flow. */
38054 res = gen_reg_rtx (mode);
38055 emit_move_insn (res, operand1);
38057 /* xa = abs (operand1) */
38058 xa = ix86_expand_sse_fabs (res, &smask);
38060 /* if (!isless (xa, TWO52)) goto label; */
38061 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38063 /* res = xa + TWO52 - TWO52; */
38064 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38065 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38066 emit_move_insn (res, tmp);
38068 /* generate 1.0 */
38069 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38071 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38072 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38073 emit_insn (gen_rtx_SET (VOIDmode, mask,
38074 gen_rtx_AND (mode, mask, one)));
38075 tmp = expand_simple_binop (mode, MINUS,
38076 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38077 emit_move_insn (res, tmp);
38079 /* res = copysign (res, operand1) */
38080 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38082 emit_label (label);
38083 LABEL_NUSES (label) = 1;
38085 emit_move_insn (operand0, res);
38088 /* Expand SSE sequence for computing round from OPERAND1 storing
38089 into OPERAND0. */
38090 void
38091 ix86_expand_round (rtx operand0, rtx operand1)
38093 /* C code for the stuff we're doing below:
38094 double xa = fabs (x);
38095 if (!isless (xa, TWO52))
38096 return x;
38097 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38098 return copysign (xa, x);
38100 enum machine_mode mode = GET_MODE (operand0);
38101 rtx res, TWO52, xa, label, xi, half, mask;
38102 const struct real_format *fmt;
38103 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38105 /* Temporary for holding the result, initialized to the input
38106 operand to ease control flow. */
38107 res = gen_reg_rtx (mode);
38108 emit_move_insn (res, operand1);
38110 TWO52 = ix86_gen_TWO52 (mode);
38111 xa = ix86_expand_sse_fabs (res, &mask);
38112 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38114 /* load nextafter (0.5, 0.0) */
38115 fmt = REAL_MODE_FORMAT (mode);
38116 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38117 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38119 /* xa = xa + 0.5 */
38120 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38121 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38123 /* xa = (double)(int64_t)xa */
38124 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38125 expand_fix (xi, xa, 0);
38126 expand_float (xa, xi, 0);
38128 /* res = copysign (xa, operand1) */
38129 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38131 emit_label (label);
38132 LABEL_NUSES (label) = 1;
38134 emit_move_insn (operand0, res);
38137 /* Expand SSE sequence for computing round
38138 from OP1 storing into OP0 using sse4 round insn. */
38139 void
38140 ix86_expand_round_sse4 (rtx op0, rtx op1)
38142 enum machine_mode mode = GET_MODE (op0);
38143 rtx e1, e2, res, half;
38144 const struct real_format *fmt;
38145 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38146 rtx (*gen_copysign) (rtx, rtx, rtx);
38147 rtx (*gen_round) (rtx, rtx, rtx);
38149 switch (mode)
38151 case SFmode:
38152 gen_copysign = gen_copysignsf3;
38153 gen_round = gen_sse4_1_roundsf2;
38154 break;
38155 case DFmode:
38156 gen_copysign = gen_copysigndf3;
38157 gen_round = gen_sse4_1_rounddf2;
38158 break;
38159 default:
38160 gcc_unreachable ();
38163 /* round (a) = trunc (a + copysign (0.5, a)) */
38165 /* load nextafter (0.5, 0.0) */
38166 fmt = REAL_MODE_FORMAT (mode);
38167 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38168 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38169 half = const_double_from_real_value (pred_half, mode);
38171 /* e1 = copysign (0.5, op1) */
38172 e1 = gen_reg_rtx (mode);
38173 emit_insn (gen_copysign (e1, half, op1));
38175 /* e2 = op1 + e1 */
38176 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38178 /* res = trunc (e2) */
38179 res = gen_reg_rtx (mode);
38180 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38182 emit_move_insn (op0, res);
38186 /* Table of valid machine attributes. */
38187 static const struct attribute_spec ix86_attribute_table[] =
38189 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38190 affects_type_identity } */
38191 /* Stdcall attribute says callee is responsible for popping arguments
38192 if they are not variable. */
38193 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38194 true },
38195 /* Fastcall attribute says callee is responsible for popping arguments
38196 if they are not variable. */
38197 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38198 true },
38199 /* Thiscall attribute says callee is responsible for popping arguments
38200 if they are not variable. */
38201 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38202 true },
38203 /* Cdecl attribute says the callee is a normal C declaration */
38204 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38205 true },
38206 /* Regparm attribute specifies how many integer arguments are to be
38207 passed in registers. */
38208 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38209 true },
38210 /* Sseregparm attribute says we are using x86_64 calling conventions
38211 for FP arguments. */
38212 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38213 true },
38214 /* The transactional memory builtins are implicitly regparm or fastcall
38215 depending on the ABI. Override the generic do-nothing attribute that
38216 these builtins were declared with. */
38217 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38218 true },
38219 /* force_align_arg_pointer says this function realigns the stack at entry. */
38220 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38221 false, true, true, ix86_handle_cconv_attribute, false },
38222 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38223 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38224 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38225 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38226 false },
38227 #endif
38228 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38229 false },
38230 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38231 false },
38232 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38233 SUBTARGET_ATTRIBUTE_TABLE,
38234 #endif
38235 /* ms_abi and sysv_abi calling convention function attributes. */
38236 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38237 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38238 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38239 false },
38240 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38241 ix86_handle_callee_pop_aggregate_return, true },
38242 /* End element. */
38243 { NULL, 0, 0, false, false, false, NULL, false }
38246 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38247 static int
38248 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38249 tree vectype,
38250 int misalign ATTRIBUTE_UNUSED)
38252 unsigned elements;
38254 switch (type_of_cost)
38256 case scalar_stmt:
38257 return ix86_cost->scalar_stmt_cost;
38259 case scalar_load:
38260 return ix86_cost->scalar_load_cost;
38262 case scalar_store:
38263 return ix86_cost->scalar_store_cost;
38265 case vector_stmt:
38266 return ix86_cost->vec_stmt_cost;
38268 case vector_load:
38269 return ix86_cost->vec_align_load_cost;
38271 case vector_store:
38272 return ix86_cost->vec_store_cost;
38274 case vec_to_scalar:
38275 return ix86_cost->vec_to_scalar_cost;
38277 case scalar_to_vec:
38278 return ix86_cost->scalar_to_vec_cost;
38280 case unaligned_load:
38281 case unaligned_store:
38282 return ix86_cost->vec_unalign_load_cost;
38284 case cond_branch_taken:
38285 return ix86_cost->cond_taken_branch_cost;
38287 case cond_branch_not_taken:
38288 return ix86_cost->cond_not_taken_branch_cost;
38290 case vec_perm:
38291 case vec_promote_demote:
38292 return ix86_cost->vec_stmt_cost;
38294 case vec_construct:
38295 elements = TYPE_VECTOR_SUBPARTS (vectype);
38296 return elements / 2 + 1;
38298 default:
38299 gcc_unreachable ();
38303 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38304 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38305 insn every time. */
38307 static GTY(()) rtx vselect_insn;
38309 /* Initialize vselect_insn. */
38311 static void
38312 init_vselect_insn (void)
38314 unsigned i;
38315 rtx x;
38317 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38318 for (i = 0; i < MAX_VECT_LEN; ++i)
38319 XVECEXP (x, 0, i) = const0_rtx;
38320 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38321 const0_rtx), x);
38322 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38323 start_sequence ();
38324 vselect_insn = emit_insn (x);
38325 end_sequence ();
38328 /* Construct (set target (vec_select op0 (parallel perm))) and
38329 return true if that's a valid instruction in the active ISA. */
38331 static bool
38332 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38333 unsigned nelt, bool testing_p)
38335 unsigned int i;
38336 rtx x, save_vconcat;
38337 int icode;
38339 if (vselect_insn == NULL_RTX)
38340 init_vselect_insn ();
38342 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38343 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38344 for (i = 0; i < nelt; ++i)
38345 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38346 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38347 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38348 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38349 SET_DEST (PATTERN (vselect_insn)) = target;
38350 icode = recog_memoized (vselect_insn);
38352 if (icode >= 0 && !testing_p)
38353 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38355 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38356 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38357 INSN_CODE (vselect_insn) = -1;
38359 return icode >= 0;
38362 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38364 static bool
38365 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38366 const unsigned char *perm, unsigned nelt,
38367 bool testing_p)
38369 enum machine_mode v2mode;
38370 rtx x;
38371 bool ok;
38373 if (vselect_insn == NULL_RTX)
38374 init_vselect_insn ();
38376 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38377 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38378 PUT_MODE (x, v2mode);
38379 XEXP (x, 0) = op0;
38380 XEXP (x, 1) = op1;
38381 ok = expand_vselect (target, x, perm, nelt, testing_p);
38382 XEXP (x, 0) = const0_rtx;
38383 XEXP (x, 1) = const0_rtx;
38384 return ok;
38387 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38388 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
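 /* For example, for V4SFmode and the permutation { 0, 5, 2, 7 } elements
    1 and 3 are taken from the second operand, so the mask computed below
    is 0b1010 and the emitted vec_merge corresponds to blendps with
    immediate 0xa.  */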
38390 static bool
38391 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38393 enum machine_mode vmode = d->vmode;
38394 unsigned i, mask, nelt = d->nelt;
38395 rtx target, op0, op1, x;
38396 rtx rperm[32], vperm;
38398 if (d->one_operand_p)
38399 return false;
38400 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38402 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38404 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38406 else
38407 return false;
38409 /* This is a blend, not a permute. Elements must stay in their
38410 respective lanes. */
38411 for (i = 0; i < nelt; ++i)
38413 unsigned e = d->perm[i];
38414 if (!(e == i || e == i + nelt))
38415 return false;
38418 if (d->testing_p)
38419 return true;
38421 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38422 decision should be extracted elsewhere, so that we only try that
38423 sequence once all budget==3 options have been tried. */
38424 target = d->target;
38425 op0 = d->op0;
38426 op1 = d->op1;
38427 mask = 0;
38429 switch (vmode)
38431 case V4DFmode:
38432 case V8SFmode:
38433 case V2DFmode:
38434 case V4SFmode:
38435 case V8HImode:
38436 case V8SImode:
38437 for (i = 0; i < nelt; ++i)
38438 mask |= (d->perm[i] >= nelt) << i;
38439 break;
38441 case V2DImode:
38442 for (i = 0; i < 2; ++i)
38443 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38444 vmode = V8HImode;
38445 goto do_subreg;
38447 case V4SImode:
38448 for (i = 0; i < 4; ++i)
38449 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38450 vmode = V8HImode;
38451 goto do_subreg;
38453 case V16QImode:
38454 /* See if bytes move in pairs so we can use pblendw with
38455 an immediate argument, rather than pblendvb with a vector
38456 argument. */
38457 for (i = 0; i < 16; i += 2)
38458 if (d->perm[i] + 1 != d->perm[i + 1])
38460 use_pblendvb:
38461 for (i = 0; i < nelt; ++i)
38462 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38464 finish_pblendvb:
38465 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38466 vperm = force_reg (vmode, vperm);
38468 if (GET_MODE_SIZE (vmode) == 16)
38469 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38470 else
38471 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38472 return true;
38475 for (i = 0; i < 8; ++i)
38476 mask |= (d->perm[i * 2] >= 16) << i;
38477 vmode = V8HImode;
38478 /* FALLTHRU */
38480 do_subreg:
38481 target = gen_lowpart (vmode, target);
38482 op0 = gen_lowpart (vmode, op0);
38483 op1 = gen_lowpart (vmode, op1);
38484 break;
38486 case V32QImode:
38487 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38488 for (i = 0; i < 32; i += 2)
38489 if (d->perm[i] + 1 != d->perm[i + 1])
38490 goto use_pblendvb;
38491 /* See if bytes move in quadruplets. If yes, vpblendd
38492 with immediate can be used. */
38493 for (i = 0; i < 32; i += 4)
38494 if (d->perm[i] + 2 != d->perm[i + 2])
38495 break;
38496 if (i < 32)
38498 /* See if bytes move the same in both lanes. If yes,
38499 vpblendw with immediate can be used. */
38500 for (i = 0; i < 16; i += 2)
38501 if (d->perm[i] + 16 != d->perm[i + 16])
38502 goto use_pblendvb;
38504 /* Use vpblendw. */
38505 for (i = 0; i < 16; ++i)
38506 mask |= (d->perm[i * 2] >= 32) << i;
38507 vmode = V16HImode;
38508 goto do_subreg;
38511 /* Use vpblendd. */
38512 for (i = 0; i < 8; ++i)
38513 mask |= (d->perm[i * 4] >= 32) << i;
38514 vmode = V8SImode;
38515 goto do_subreg;
38517 case V16HImode:
38518 /* See if words move in pairs. If yes, vpblendd can be used. */
38519 for (i = 0; i < 16; i += 2)
38520 if (d->perm[i] + 1 != d->perm[i + 1])
38521 break;
38522 if (i < 16)
38524 /* See if words move the same in both lanes. If not,
38525 vpblendvb must be used. */
38526 for (i = 0; i < 8; i++)
38527 if (d->perm[i] + 8 != d->perm[i + 8])
38529 /* Use vpblendvb. */
38530 for (i = 0; i < 32; ++i)
38531 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38533 vmode = V32QImode;
38534 nelt = 32;
38535 target = gen_lowpart (vmode, target);
38536 op0 = gen_lowpart (vmode, op0);
38537 op1 = gen_lowpart (vmode, op1);
38538 goto finish_pblendvb;
38541 /* Use vpblendw. */
38542 for (i = 0; i < 16; ++i)
38543 mask |= (d->perm[i] >= 16) << i;
38544 break;
38547 /* Use vpblendd. */
38548 for (i = 0; i < 8; ++i)
38549 mask |= (d->perm[i * 2] >= 16) << i;
38550 vmode = V8SImode;
38551 goto do_subreg;
38553 case V4DImode:
38554 /* Use vpblendd. */
38555 for (i = 0; i < 4; ++i)
38556 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38557 vmode = V8SImode;
38558 goto do_subreg;
38560 default:
38561 gcc_unreachable ();
38564 /* This matches five different patterns with the different modes. */
38565 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38566 x = gen_rtx_SET (VOIDmode, target, x);
38567 emit_insn (x);
38569 return true;
38572 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38573 in terms of the variable form of vpermilps.
38575 Note that we will already have failed the immediate-operand form of
38576 vpermilps, which requires that the high and low halves use the same
38577 shuffle; the variable form doesn't require that. */
38579 static bool
38580 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38582 rtx rperm[8], vperm;
38583 unsigned i;
38585 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38586 return false;
38588 /* We can only permute within the 128-bit lane. */
38589 for (i = 0; i < 8; ++i)
38591 unsigned e = d->perm[i];
38592 if (i < 4 ? e >= 4 : e < 4)
38593 return false;
38596 if (d->testing_p)
38597 return true;
38599 for (i = 0; i < 8; ++i)
38601 unsigned e = d->perm[i];
38603 /* Within each 128-bit lane, the elements of op0 are numbered
38604 from 0 and the elements of op1 are numbered from 4. */
38605 if (e >= 8 + 4)
38606 e -= 8;
38607 else if (e >= 4)
38608 e -= 4;
38610 rperm[i] = GEN_INT (e);
38613 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38614 vperm = force_reg (V8SImode, vperm);
38615 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38617 return true;
38620 /* Return true if permutation D can be performed as VMODE permutation
38621 instead. */
38623 static bool
38624 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38626 unsigned int i, j, chunk;
38628 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38629 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38630 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38631 return false;
38633 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38634 return true;
38636 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38637 for (i = 0; i < d->nelt; i += chunk)
38638 if (d->perm[i] & (chunk - 1))
38639 return false;
38640 else
38641 for (j = 1; j < chunk; ++j)
38642 if (d->perm[i] + j != d->perm[i + j])
38643 return false;
38645 return true;
38648 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38649 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38651 static bool
38652 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38654 unsigned i, nelt, eltsz, mask;
38655 unsigned char perm[32];
38656 enum machine_mode vmode = V16QImode;
38657 rtx rperm[32], vperm, target, op0, op1;
38659 nelt = d->nelt;
38661 if (!d->one_operand_p)
38663 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38665 if (TARGET_AVX2
38666 && valid_perm_using_mode_p (V2TImode, d))
38668 if (d->testing_p)
38669 return true;
38671 /* Use vperm2i128 insn. The pattern uses
38672 V4DImode instead of V2TImode. */
38673 target = gen_lowpart (V4DImode, d->target);
38674 op0 = gen_lowpart (V4DImode, d->op0);
38675 op1 = gen_lowpart (V4DImode, d->op1);
38676 rperm[0]
38677 = GEN_INT ((d->perm[0] / (nelt / 2))
38678 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38679 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38680 return true;
38682 return false;
38685 else
38687 if (GET_MODE_SIZE (d->vmode) == 16)
38689 if (!TARGET_SSSE3)
38690 return false;
38692 else if (GET_MODE_SIZE (d->vmode) == 32)
38694 if (!TARGET_AVX2)
38695 return false;
38697 /* V4DImode should already be handled through
38698 expand_vselect by the vpermq instruction. */
38699 gcc_assert (d->vmode != V4DImode);
38701 vmode = V32QImode;
38702 if (d->vmode == V8SImode
38703 || d->vmode == V16HImode
38704 || d->vmode == V32QImode)
38706 /* First see if vpermq can be used for
38707 V8SImode/V16HImode/V32QImode. */
38708 if (valid_perm_using_mode_p (V4DImode, d))
38710 for (i = 0; i < 4; i++)
38711 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38712 if (d->testing_p)
38713 return true;
38714 return expand_vselect (gen_lowpart (V4DImode, d->target),
38715 gen_lowpart (V4DImode, d->op0),
38716 perm, 4, false);
38719 /* Next see if vpermd can be used. */
38720 if (valid_perm_using_mode_p (V8SImode, d))
38721 vmode = V8SImode;
38723 /* Or if vpermps can be used. */
38724 else if (d->vmode == V8SFmode)
38725 vmode = V8SImode;
38727 if (vmode == V32QImode)
38729 /* vpshufb only works within 128-bit lanes; it is not
38730 possible to shuffle bytes between the lanes. */
38731 for (i = 0; i < nelt; ++i)
38732 if ((d->perm[i] ^ i) & (nelt / 2))
38733 return false;
38736 else
38737 return false;
38740 if (d->testing_p)
38741 return true;
38743 if (vmode == V8SImode)
38744 for (i = 0; i < 8; ++i)
38745 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38746 else
38748 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38749 if (!d->one_operand_p)
38750 mask = 2 * nelt - 1;
38751 else if (vmode == V16QImode)
38752 mask = nelt - 1;
38753 else
38754 mask = nelt / 2 - 1;
38756 for (i = 0; i < nelt; ++i)
38758 unsigned j, e = d->perm[i] & mask;
38759 for (j = 0; j < eltsz; ++j)
38760 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38764 vperm = gen_rtx_CONST_VECTOR (vmode,
38765 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38766 vperm = force_reg (vmode, vperm);
38768 target = gen_lowpart (vmode, d->target);
38769 op0 = gen_lowpart (vmode, d->op0);
38770 if (d->one_operand_p)
38772 if (vmode == V16QImode)
38773 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38774 else if (vmode == V32QImode)
38775 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38776 else if (vmode == V8SFmode)
38777 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38778 else
38779 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38781 else
38783 op1 = gen_lowpart (vmode, d->op1);
38784 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38787 return true;
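/* Worked example (illustrative) of the byte-level control built above:
   reversing a V8HImode vector, d->perm = { 7, 6, 5, 4, 3, 2, 1, 0 }, with
   one operand gives eltsz == 2 and mask == nelt - 1, so rperm becomes the
   byte sequence { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 } and a
   single pshufb performs the whole permutation.  */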
38790 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
38791 in a single instruction. */
38793 static bool
38794 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38796 unsigned i, nelt = d->nelt;
38797 unsigned char perm2[MAX_VECT_LEN];
38799 /* Check plain VEC_SELECT first, because AVX has instructions that could
38800 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38801 input where SEL+CONCAT may not. */
38802 if (d->one_operand_p)
38804 int mask = nelt - 1;
38805 bool identity_perm = true;
38806 bool broadcast_perm = true;
38808 for (i = 0; i < nelt; i++)
38810 perm2[i] = d->perm[i] & mask;
38811 if (perm2[i] != i)
38812 identity_perm = false;
38813 if (perm2[i])
38814 broadcast_perm = false;
38817 if (identity_perm)
38819 if (!d->testing_p)
38820 emit_move_insn (d->target, d->op0);
38821 return true;
38823 else if (broadcast_perm && TARGET_AVX2)
38825 /* Use vpbroadcast{b,w,d}. */
38826 rtx (*gen) (rtx, rtx) = NULL;
38827 switch (d->vmode)
38829 case V32QImode:
38830 gen = gen_avx2_pbroadcastv32qi_1;
38831 break;
38832 case V16HImode:
38833 gen = gen_avx2_pbroadcastv16hi_1;
38834 break;
38835 case V8SImode:
38836 gen = gen_avx2_pbroadcastv8si_1;
38837 break;
38838 case V16QImode:
38839 gen = gen_avx2_pbroadcastv16qi;
38840 break;
38841 case V8HImode:
38842 gen = gen_avx2_pbroadcastv8hi;
38843 break;
38844 case V8SFmode:
38845 gen = gen_avx2_vec_dupv8sf_1;
38846 break;
38847 /* For other modes prefer other shuffles this function creates. */
38848 default: break;
38850 if (gen != NULL)
38852 if (!d->testing_p)
38853 emit_insn (gen (d->target, d->op0));
38854 return true;
38858 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38859 return true;
38861 /* There are plenty of patterns in sse.md that are written for
38862 SEL+CONCAT and are not replicated for a single op. Perhaps
38863 that should be changed, to avoid the nastiness here. */
38865 /* Recognize interleave style patterns, which means incrementing
38866 every other permutation operand. */
38867 for (i = 0; i < nelt; i += 2)
38869 perm2[i] = d->perm[i] & mask;
38870 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38872 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38873 d->testing_p))
38874 return true;
38876 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38877 if (nelt >= 4)
38879 for (i = 0; i < nelt; i += 4)
38881 perm2[i + 0] = d->perm[i + 0] & mask;
38882 perm2[i + 1] = d->perm[i + 1] & mask;
38883 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38884 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38887 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38888 d->testing_p))
38889 return true;
38893 /* Finally, try the fully general two operand permute. */
38894 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38895 d->testing_p))
38896 return true;
38898 /* Recognize interleave style patterns with reversed operands. */
38899 if (!d->one_operand_p)
38901 for (i = 0; i < nelt; ++i)
38903 unsigned e = d->perm[i];
38904 if (e >= nelt)
38905 e -= nelt;
38906 else
38907 e += nelt;
38908 perm2[i] = e;
38911 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38912 d->testing_p))
38913 return true;
38916 /* Try the SSE4.1 blend variable merge instructions. */
38917 if (expand_vec_perm_blend (d))
38918 return true;
38920 /* Try one of the AVX vpermil variable permutations. */
38921 if (expand_vec_perm_vpermil (d))
38922 return true;
38924 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38925 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38926 if (expand_vec_perm_pshufb (d))
38927 return true;
38929 return false;
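/* For instance (illustrative), the interleave-style recognition above turns
   duplicating every byte of one V16QImode operand,
   d->perm = { 0,0, 1,1, ..., 7,7 }, into perm2 = { 0,16, 1,17, ..., 7,23 },
   which expand_vselect_vconcat can match as a punpcklbw of the operand
   with itself.  */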
38932 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
38933 in terms of a pair of pshuflw + pshufhw instructions. */
38935 static bool
38936 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38938 unsigned char perm2[MAX_VECT_LEN];
38939 unsigned i;
38940 bool ok;
38942 if (d->vmode != V8HImode || !d->one_operand_p)
38943 return false;
38945 /* The two permutations only operate in 64-bit lanes. */
38946 for (i = 0; i < 4; ++i)
38947 if (d->perm[i] >= 4)
38948 return false;
38949 for (i = 4; i < 8; ++i)
38950 if (d->perm[i] < 4)
38951 return false;
38953 if (d->testing_p)
38954 return true;
38956 /* Emit the pshuflw. */
38957 memcpy (perm2, d->perm, 4);
38958 for (i = 4; i < 8; ++i)
38959 perm2[i] = i;
38960 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38961 gcc_assert (ok);
38963 /* Emit the pshufhw. */
38964 memcpy (perm2 + 4, d->perm + 4, 4);
38965 for (i = 0; i < 4; ++i)
38966 perm2[i] = i;
38967 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38968 gcc_assert (ok);
38970 return true;
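/* Worked example (illustrative): for V8HImode and
   d->perm = { 2, 0, 3, 1, 6, 4, 7, 5 } the low four indices stay below 4
   and the high four stay at or above 4, so the pshuflw uses
   { 2, 0, 3, 1, 4, 5, 6, 7 } and the following pshufhw uses
   { 0, 1, 2, 3, 6, 4, 7, 5 }, realizing the shuffle in two insns.  */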
38973 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
38974 the permutation using the SSSE3 palignr instruction. This succeeds
38975 when all of the elements in PERM fit within one vector and we merely
38976 need to shift them down so that a single vector permutation has a
38977 chance to succeed. */
38979 static bool
38980 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38982 unsigned i, nelt = d->nelt;
38983 unsigned min, max;
38984 bool in_order, ok;
38985 rtx shift;
38987 /* Even with AVX, palignr only operates on 128-bit vectors. */
38988 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38989 return false;
38991 min = nelt, max = 0;
38992 for (i = 0; i < nelt; ++i)
38994 unsigned e = d->perm[i];
38995 if (e < min)
38996 min = e;
38997 if (e > max)
38998 max = e;
39000 if (min == 0 || max - min >= nelt)
39001 return false;
39003 /* Given that we have SSSE3, we know we'll be able to implement the
39004 single operand permutation after the palignr with pshufb. */
39005 if (d->testing_p)
39006 return true;
39008 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39009 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39010 gen_lowpart (TImode, d->op1),
39011 gen_lowpart (TImode, d->op0), shift));
39013 d->op0 = d->op1 = d->target;
39014 d->one_operand_p = true;
39016 in_order = true;
39017 for (i = 0; i < nelt; ++i)
39019 unsigned e = d->perm[i] - min;
39020 if (e != i)
39021 in_order = false;
39022 d->perm[i] = e;
39025 /* Test for the degenerate case where the alignment by itself
39026 produces the desired permutation. */
39027 if (in_order)
39028 return true;
39030 ok = expand_vec_perm_1 (d);
39031 gcc_assert (ok);
39033 return ok;
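/* Worked example (illustrative): for V16QImode with
   d->perm = { 5, 6, ..., 20 } we get min == 5 and max - min == 15 < nelt,
   so the palignr above shifts the op1:op0 concatenation down by 5 bytes;
   the adjusted permutation is then the identity, the in_order early return
   fires, and the whole shuffle costs a single palignr.  */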
39036 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39038 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39039 a two vector permutation into a single vector permutation by using
39040 an interleave operation to merge the vectors. */
39042 static bool
39043 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39045 struct expand_vec_perm_d dremap, dfinal;
39046 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39047 unsigned HOST_WIDE_INT contents;
39048 unsigned char remap[2 * MAX_VECT_LEN];
39049 rtx seq;
39050 bool ok, same_halves = false;
39052 if (GET_MODE_SIZE (d->vmode) == 16)
39054 if (d->one_operand_p)
39055 return false;
39057 else if (GET_MODE_SIZE (d->vmode) == 32)
39059 if (!TARGET_AVX)
39060 return false;
39061 /* For 32-byte modes, allow this even when d->one_operand_p:
39062 the lack of cross-lane shuffling in some instructions
39063 might prevent a single insn shuffle. */
39064 dfinal = *d;
39065 dfinal.testing_p = true;
39066 /* If expand_vec_perm_interleave3 can expand this into
39067 a 3 insn sequence, give up and let it be expanded as
39068 a 3 insn sequence. While that is one insn longer,
39069 it doesn't need a memory operand, and in the common
39070 case where both the interleave low and interleave high
39071 permutations with the same operands are adjacent, it
39072 needs only 4 insns for both after CSE. */
39073 if (expand_vec_perm_interleave3 (&dfinal))
39074 return false;
39076 else
39077 return false;
39079 /* Examine from whence the elements come. */
39080 contents = 0;
39081 for (i = 0; i < nelt; ++i)
39082 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39084 memset (remap, 0xff, sizeof (remap));
39085 dremap = *d;
39087 if (GET_MODE_SIZE (d->vmode) == 16)
39089 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39091 /* Split the two input vectors into 4 halves. */
39092 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39093 h2 = h1 << nelt2;
39094 h3 = h2 << nelt2;
39095 h4 = h3 << nelt2;
39097 /* If the elements are all from the low halves, use interleave low;
39098 similarly, use interleave high for the high halves. If the elements are
39099 from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39100 if ((contents & (h1 | h3)) == contents)
39102 /* punpckl* */
39103 for (i = 0; i < nelt2; ++i)
39105 remap[i] = i * 2;
39106 remap[i + nelt] = i * 2 + 1;
39107 dremap.perm[i * 2] = i;
39108 dremap.perm[i * 2 + 1] = i + nelt;
39110 if (!TARGET_SSE2 && d->vmode == V4SImode)
39111 dremap.vmode = V4SFmode;
39113 else if ((contents & (h2 | h4)) == contents)
39115 /* punpckh* */
39116 for (i = 0; i < nelt2; ++i)
39118 remap[i + nelt2] = i * 2;
39119 remap[i + nelt + nelt2] = i * 2 + 1;
39120 dremap.perm[i * 2] = i + nelt2;
39121 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39123 if (!TARGET_SSE2 && d->vmode == V4SImode)
39124 dremap.vmode = V4SFmode;
39126 else if ((contents & (h1 | h4)) == contents)
39128 /* shufps */
39129 for (i = 0; i < nelt2; ++i)
39131 remap[i] = i;
39132 remap[i + nelt + nelt2] = i + nelt2;
39133 dremap.perm[i] = i;
39134 dremap.perm[i + nelt2] = i + nelt + nelt2;
39136 if (nelt != 4)
39138 /* shufpd */
39139 dremap.vmode = V2DImode;
39140 dremap.nelt = 2;
39141 dremap.perm[0] = 0;
39142 dremap.perm[1] = 3;
39145 else if ((contents & (h2 | h3)) == contents)
39147 /* shufps */
39148 for (i = 0; i < nelt2; ++i)
39150 remap[i + nelt2] = i;
39151 remap[i + nelt] = i + nelt2;
39152 dremap.perm[i] = i + nelt2;
39153 dremap.perm[i + nelt2] = i + nelt;
39155 if (nelt != 4)
39157 /* shufpd */
39158 dremap.vmode = V2DImode;
39159 dremap.nelt = 2;
39160 dremap.perm[0] = 1;
39161 dremap.perm[1] = 2;
39164 else
39165 return false;
39167 else
39169 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39170 unsigned HOST_WIDE_INT q[8];
39171 unsigned int nonzero_halves[4];
39173 /* Split the two input vectors into 8 quarters. */
39174 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39175 for (i = 1; i < 8; ++i)
39176 q[i] = q[0] << (nelt4 * i);
39177 for (i = 0; i < 4; ++i)
39178 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39180 nonzero_halves[nzcnt] = i;
39181 ++nzcnt;
39184 if (nzcnt == 1)
39186 gcc_assert (d->one_operand_p);
39187 nonzero_halves[1] = nonzero_halves[0];
39188 same_halves = true;
39190 else if (d->one_operand_p)
39192 gcc_assert (nonzero_halves[0] == 0);
39193 gcc_assert (nonzero_halves[1] == 1);
39196 if (nzcnt <= 2)
39198 if (d->perm[0] / nelt2 == nonzero_halves[1])
39200 /* Attempt to increase the likelihood that dfinal
39201 shuffle will be intra-lane. */
39202 char tmph = nonzero_halves[0];
39203 nonzero_halves[0] = nonzero_halves[1];
39204 nonzero_halves[1] = tmph;
39207 /* vperm2f128 or vperm2i128. */
39208 for (i = 0; i < nelt2; ++i)
39210 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39211 remap[i + nonzero_halves[0] * nelt2] = i;
39212 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39213 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39216 if (d->vmode != V8SFmode
39217 && d->vmode != V4DFmode
39218 && d->vmode != V8SImode)
39220 dremap.vmode = V8SImode;
39221 dremap.nelt = 8;
39222 for (i = 0; i < 4; ++i)
39224 dremap.perm[i] = i + nonzero_halves[0] * 4;
39225 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39229 else if (d->one_operand_p)
39230 return false;
39231 else if (TARGET_AVX2
39232 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39234 /* vpunpckl* */
39235 for (i = 0; i < nelt4; ++i)
39237 remap[i] = i * 2;
39238 remap[i + nelt] = i * 2 + 1;
39239 remap[i + nelt2] = i * 2 + nelt2;
39240 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39241 dremap.perm[i * 2] = i;
39242 dremap.perm[i * 2 + 1] = i + nelt;
39243 dremap.perm[i * 2 + nelt2] = i + nelt2;
39244 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39247 else if (TARGET_AVX2
39248 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39250 /* vpunpckh* */
39251 for (i = 0; i < nelt4; ++i)
39253 remap[i + nelt4] = i * 2;
39254 remap[i + nelt + nelt4] = i * 2 + 1;
39255 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39256 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39257 dremap.perm[i * 2] = i + nelt4;
39258 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39259 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39260 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39263 else
39264 return false;
39267 /* Use the remapping array set up above to move the elements from their
39268 swizzled locations into their final destinations. */
39269 dfinal = *d;
39270 for (i = 0; i < nelt; ++i)
39272 unsigned e = remap[d->perm[i]];
39273 gcc_assert (e < nelt);
39274 /* If same_halves is true, both halves of the remapped vector are the
39275 same. Avoid cross-lane accesses if possible. */
39276 if (same_halves && i >= nelt2)
39278 gcc_assert (e < nelt2);
39279 dfinal.perm[i] = e + nelt2;
39281 else
39282 dfinal.perm[i] = e;
39284 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39285 dfinal.op1 = dfinal.op0;
39286 dfinal.one_operand_p = true;
39287 dremap.target = dfinal.op0;
39289 /* Test if the final remap can be done with a single insn. For V4SFmode or
39290 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39291 start_sequence ();
39292 ok = expand_vec_perm_1 (&dfinal);
39293 seq = get_insns ();
39294 end_sequence ();
39296 if (!ok)
39297 return false;
39299 if (d->testing_p)
39300 return true;
39302 if (dremap.vmode != dfinal.vmode)
39304 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39305 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39306 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39309 ok = expand_vec_perm_1 (&dremap);
39310 gcc_assert (ok);
39312 emit_insn (seq);
39313 return true;
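/* Worked example (illustrative): for V4SImode with two operands and
   d->perm = { 0, 5, 1, 4 }, all elements come from the low halves, so the
   punpckl* case above sets dremap.perm = { 0, 4, 1, 5 } and remap maps
   0->0, 4->1, 1->2, 5->3; dfinal.perm becomes { 0, 3, 2, 1 }, which
   expand_vec_perm_1 finishes with a single pshufd after the interleave.  */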
39316 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39317 a single vector cross-lane permutation into vpermq followed
39318 by any of the single insn permutations. */
39320 static bool
39321 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39323 struct expand_vec_perm_d dremap, dfinal;
39324 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39325 unsigned contents[2];
39326 bool ok;
39328 if (!(TARGET_AVX2
39329 && (d->vmode == V32QImode || d->vmode == V16HImode)
39330 && d->one_operand_p))
39331 return false;
39333 contents[0] = 0;
39334 contents[1] = 0;
39335 for (i = 0; i < nelt2; ++i)
39337 contents[0] |= 1u << (d->perm[i] / nelt4);
39338 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39341 for (i = 0; i < 2; ++i)
39343 unsigned int cnt = 0;
39344 for (j = 0; j < 4; ++j)
39345 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39346 return false;
39349 if (d->testing_p)
39350 return true;
39352 dremap = *d;
39353 dremap.vmode = V4DImode;
39354 dremap.nelt = 4;
39355 dremap.target = gen_reg_rtx (V4DImode);
39356 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39357 dremap.op1 = dremap.op0;
39358 dremap.one_operand_p = true;
39359 for (i = 0; i < 2; ++i)
39361 unsigned int cnt = 0;
39362 for (j = 0; j < 4; ++j)
39363 if ((contents[i] & (1u << j)) != 0)
39364 dremap.perm[2 * i + cnt++] = j;
39365 for (; cnt < 2; ++cnt)
39366 dremap.perm[2 * i + cnt] = 0;
39369 dfinal = *d;
39370 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39371 dfinal.op1 = dfinal.op0;
39372 dfinal.one_operand_p = true;
39373 for (i = 0, j = 0; i < nelt; ++i)
39375 if (i == nelt2)
39376 j = 2;
39377 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39378 if ((d->perm[i] / nelt4) == dremap.perm[j])
39380 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39381 dfinal.perm[i] |= nelt4;
39382 else
39383 gcc_unreachable ();
39386 ok = expand_vec_perm_1 (&dremap);
39387 gcc_assert (ok);
39389 ok = expand_vec_perm_1 (&dfinal);
39390 gcc_assert (ok);
39392 return true;
39395 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
39396 a vector permutation using two instructions, vperm2f128 resp.
39397 vperm2i128 followed by any single in-lane permutation. */
39399 static bool
39400 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39402 struct expand_vec_perm_d dfirst, dsecond;
39403 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39404 bool ok;
39406 if (!TARGET_AVX
39407 || GET_MODE_SIZE (d->vmode) != 32
39408 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39409 return false;
39411 dsecond = *d;
39412 dsecond.one_operand_p = false;
39413 dsecond.testing_p = true;
39415 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39416 immediate. For perm < 16 the second permutation uses
39417 d->op0 as first operand, for perm >= 16 it uses d->op1
39418 as first operand. The second operand is the result of
39419 vperm2[fi]128. */
39420 for (perm = 0; perm < 32; perm++)
39422 /* Ignore permutations which do not move anything cross-lane. */
39423 if (perm < 16)
39425 /* The second shuffle for e.g. V4DFmode has
39426 0123 and ABCD operands.
39427 Ignore AB23, as 23 is already in the second lane
39428 of the first operand. */
39429 if ((perm & 0xc) == (1 << 2)) continue;
39430 /* And 01CD, as 01 is in the first lane of the first
39431 operand. */
39432 if ((perm & 3) == 0) continue;
39433 /* And 4567, as then the vperm2[fi]128 doesn't change
39434 anything on the original 4567 second operand. */
39435 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39437 else
39439 /* The second shuffle for e.g. V4DFmode has
39440 4567 and ABCD operands.
39441 Ignore AB67, as 67 is already in the second lane
39442 of the first operand. */
39443 if ((perm & 0xc) == (3 << 2)) continue;
39444 /* And 45CD, as 45 is in the first lane of the first
39445 operand. */
39446 if ((perm & 3) == 2) continue;
39447 /* And 0123, as then the vperm2[fi]128 doesn't change
39448 anything on the original 0123 first operand. */
39449 if ((perm & 0xf) == (1 << 2)) continue;
39452 for (i = 0; i < nelt; i++)
39454 j = d->perm[i] / nelt2;
39455 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39456 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39457 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39458 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39459 else
39460 break;
39463 if (i == nelt)
39465 start_sequence ();
39466 ok = expand_vec_perm_1 (&dsecond);
39467 end_sequence ();
39469 else
39470 ok = false;
39472 if (ok)
39474 if (d->testing_p)
39475 return true;
39477 /* Found a usable second shuffle. dfirst will be
39478 vperm2f128 on d->op0 and d->op1. */
39479 dsecond.testing_p = false;
39480 dfirst = *d;
39481 dfirst.target = gen_reg_rtx (d->vmode);
39482 for (i = 0; i < nelt; i++)
39483 dfirst.perm[i] = (i & (nelt2 - 1))
39484 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39486 ok = expand_vec_perm_1 (&dfirst);
39487 gcc_assert (ok);
39489 /* And dsecond is some single insn shuffle, taking
39490 d->op0 and result of vperm2f128 (if perm < 16) or
39491 d->op1 and result of vperm2f128 (otherwise). */
39492 dsecond.op1 = dfirst.target;
39493 if (perm >= 16)
39494 dsecond.op0 = dfirst.op1;
39496 ok = expand_vec_perm_1 (&dsecond);
39497 gcc_assert (ok);
39499 return true;
39502 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39503 if (d->one_operand_p)
39504 return false;
39507 return false;
39510 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39511 a two vector permutation using 2 intra-lane interleave insns
39512 and cross-lane shuffle for 32-byte vectors. */
39514 static bool
39515 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39517 unsigned i, nelt;
39518 rtx (*gen) (rtx, rtx, rtx);
39520 if (d->one_operand_p)
39521 return false;
39522 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39524 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39526 else
39527 return false;
39529 nelt = d->nelt;
39530 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39531 return false;
39532 for (i = 0; i < nelt; i += 2)
39533 if (d->perm[i] != d->perm[0] + i / 2
39534 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39535 return false;
39537 if (d->testing_p)
39538 return true;
39540 switch (d->vmode)
39542 case V32QImode:
39543 if (d->perm[0])
39544 gen = gen_vec_interleave_highv32qi;
39545 else
39546 gen = gen_vec_interleave_lowv32qi;
39547 break;
39548 case V16HImode:
39549 if (d->perm[0])
39550 gen = gen_vec_interleave_highv16hi;
39551 else
39552 gen = gen_vec_interleave_lowv16hi;
39553 break;
39554 case V8SImode:
39555 if (d->perm[0])
39556 gen = gen_vec_interleave_highv8si;
39557 else
39558 gen = gen_vec_interleave_lowv8si;
39559 break;
39560 case V4DImode:
39561 if (d->perm[0])
39562 gen = gen_vec_interleave_highv4di;
39563 else
39564 gen = gen_vec_interleave_lowv4di;
39565 break;
39566 case V8SFmode:
39567 if (d->perm[0])
39568 gen = gen_vec_interleave_highv8sf;
39569 else
39570 gen = gen_vec_interleave_lowv8sf;
39571 break;
39572 case V4DFmode:
39573 if (d->perm[0])
39574 gen = gen_vec_interleave_highv4df;
39575 else
39576 gen = gen_vec_interleave_lowv4df;
39577 break;
39578 default:
39579 gcc_unreachable ();
39582 emit_insn (gen (d->target, d->op0, d->op1));
39583 return true;
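/* For instance (illustrative), a V8SFmode permutation
   d->perm = { 0, 8, 1, 9, 2, 10, 3, 11 } satisfies the checks above
   (d->perm[0] == 0 and every pair is { d->perm[0] + i/2, d->perm[0] + i/2
   + nelt }), so it is emitted through gen_vec_interleave_lowv8sf, which for
   AVX is itself an expander and may emit more than one machine insn.  */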
39586 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
39587 a single vector permutation using a single intra-lane vector
39588 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39589 the non-swapped and swapped vectors together. */
39591 static bool
39592 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39594 struct expand_vec_perm_d dfirst, dsecond;
39595 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39596 rtx seq;
39597 bool ok;
39598 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39600 if (!TARGET_AVX
39601 || TARGET_AVX2
39602 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39603 || !d->one_operand_p)
39604 return false;
39606 dfirst = *d;
39607 for (i = 0; i < nelt; i++)
39608 dfirst.perm[i] = 0xff;
39609 for (i = 0, msk = 0; i < nelt; i++)
39611 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39612 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39613 return false;
39614 dfirst.perm[j] = d->perm[i];
39615 if (j != i)
39616 msk |= (1 << i);
39618 for (i = 0; i < nelt; i++)
39619 if (dfirst.perm[i] == 0xff)
39620 dfirst.perm[i] = i;
39622 if (!d->testing_p)
39623 dfirst.target = gen_reg_rtx (dfirst.vmode);
39625 start_sequence ();
39626 ok = expand_vec_perm_1 (&dfirst);
39627 seq = get_insns ();
39628 end_sequence ();
39630 if (!ok)
39631 return false;
39633 if (d->testing_p)
39634 return true;
39636 emit_insn (seq);
39638 dsecond = *d;
39639 dsecond.op0 = dfirst.target;
39640 dsecond.op1 = dfirst.target;
39641 dsecond.one_operand_p = true;
39642 dsecond.target = gen_reg_rtx (dsecond.vmode);
39643 for (i = 0; i < nelt; i++)
39644 dsecond.perm[i] = i ^ nelt2;
39646 ok = expand_vec_perm_1 (&dsecond);
39647 gcc_assert (ok);
39649 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39650 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39651 return true;
39654 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
39655 permutation using two vperm2f128, followed by a vshufpd insn blending
39656 the two vectors together. */
39658 static bool
39659 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39661 struct expand_vec_perm_d dfirst, dsecond, dthird;
39662 bool ok;
39664 if (!TARGET_AVX || (d->vmode != V4DFmode))
39665 return false;
39667 if (d->testing_p)
39668 return true;
39670 dfirst = *d;
39671 dsecond = *d;
39672 dthird = *d;
39674 dfirst.perm[0] = (d->perm[0] & ~1);
39675 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39676 dfirst.perm[2] = (d->perm[2] & ~1);
39677 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39678 dsecond.perm[0] = (d->perm[1] & ~1);
39679 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39680 dsecond.perm[2] = (d->perm[3] & ~1);
39681 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39682 dthird.perm[0] = (d->perm[0] % 2);
39683 dthird.perm[1] = (d->perm[1] % 2) + 4;
39684 dthird.perm[2] = (d->perm[2] % 2) + 2;
39685 dthird.perm[3] = (d->perm[3] % 2) + 6;
39687 dfirst.target = gen_reg_rtx (dfirst.vmode);
39688 dsecond.target = gen_reg_rtx (dsecond.vmode);
39689 dthird.op0 = dfirst.target;
39690 dthird.op1 = dsecond.target;
39691 dthird.one_operand_p = false;
39693 canonicalize_perm (&dfirst);
39694 canonicalize_perm (&dsecond);
39696 ok = expand_vec_perm_1 (&dfirst)
39697 && expand_vec_perm_1 (&dsecond)
39698 && expand_vec_perm_1 (&dthird);
39700 gcc_assert (ok);
39702 return true;
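/* Worked example (illustrative): for d->perm = { 2, 6, 1, 5 } the code
   above builds dfirst.perm = { 2, 3, 0, 1 } (lane swap of op0),
   dsecond.perm = { 6, 7, 4, 5 } (lane swap of op1) and
   dthird.perm = { 0, 4, 3, 7 }; the first two become vperm2f128 insns and
   the third is a vshufpd-style blend of the two intermediate results.  */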
39705 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39706 permutation with two pshufb insns and an ior. We should have already
39707 failed all two instruction sequences. */
39709 static bool
39710 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39712 rtx rperm[2][16], vperm, l, h, op, m128;
39713 unsigned int i, nelt, eltsz;
39715 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39716 return false;
39717 gcc_assert (!d->one_operand_p);
39719 nelt = d->nelt;
39720 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39722 /* Generate two permutation masks. If the required element is within
39723 the given vector it is shuffled into the proper lane. If the required
39724 element is in the other vector, force a zero into the lane by setting
39725 bit 7 in the permutation mask. */
39726 m128 = GEN_INT (-128);
39727 for (i = 0; i < nelt; ++i)
39729 unsigned j, e = d->perm[i];
39730 unsigned which = (e >= nelt);
39731 if (e >= nelt)
39732 e -= nelt;
39734 for (j = 0; j < eltsz; ++j)
39736 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39737 rperm[1-which][i*eltsz + j] = m128;
39741 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39742 vperm = force_reg (V16QImode, vperm);
39744 l = gen_reg_rtx (V16QImode);
39745 op = gen_lowpart (V16QImode, d->op0);
39746 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39748 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39749 vperm = force_reg (V16QImode, vperm);
39751 h = gen_reg_rtx (V16QImode);
39752 op = gen_lowpart (V16QImode, d->op1);
39753 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39755 op = gen_lowpart (V16QImode, d->target);
39756 emit_insn (gen_iorv16qi3 (op, l, h));
39758 return true;
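/* Worked example (illustrative): extracting the even elements of two
   V8HImode vectors, d->perm = { 0, 2, 4, 6, 8, 10, 12, 14 }, produces the
   byte masks { 0,1,4,5,8,9,12,13, -128 x 8 } for op0 and
   { -128 x 8, 0,1,4,5,8,9,12,13 } for op1; each pshufb leaves zeros where
   the other operand's data belongs, so the final por yields the result.  */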
39761 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39762 with two vpshufb insns, vpermq and vpor. We should have already failed
39763 all two or three instruction sequences. */
39765 static bool
39766 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39768 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39769 unsigned int i, nelt, eltsz;
39771 if (!TARGET_AVX2
39772 || !d->one_operand_p
39773 || (d->vmode != V32QImode && d->vmode != V16HImode))
39774 return false;
39776 if (d->testing_p)
39777 return true;
39779 nelt = d->nelt;
39780 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39782 /* Generate two permutation masks. If the required element is within
39783 the same lane, it is shuffled in. If the required element is from the
39784 other lane, force a zero by setting bit 7 in the permutation mask.
39785 The other mask has non-negative elements where the element is
39786 requested from the other lane; it is also moved to the other lane,
39787 so that the result of vpshufb can have the two V2TImode halves
39788 swapped. */
39789 m128 = GEN_INT (-128);
39790 for (i = 0; i < nelt; ++i)
39792 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39793 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39795 for (j = 0; j < eltsz; ++j)
39797 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39798 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39802 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39803 vperm = force_reg (V32QImode, vperm);
39805 h = gen_reg_rtx (V32QImode);
39806 op = gen_lowpart (V32QImode, d->op0);
39807 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39809 /* Swap the 128-bit lanes of h into hp. */
39810 hp = gen_reg_rtx (V4DImode);
39811 op = gen_lowpart (V4DImode, h);
39812 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39813 const1_rtx));
39815 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39816 vperm = force_reg (V32QImode, vperm);
39818 l = gen_reg_rtx (V32QImode);
39819 op = gen_lowpart (V32QImode, d->op0);
39820 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39822 op = gen_lowpart (V32QImode, d->target);
39823 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39825 return true;
39828 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39830 and extract-odd permutations of two V32QImode or V16HImode operands
39830 with two vpshufb insns, vpor and vpermq. We should have already
39831 failed all two or three instruction sequences. */
39833 static bool
39834 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39836 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39837 unsigned int i, nelt, eltsz;
39839 if (!TARGET_AVX2
39840 || d->one_operand_p
39841 || (d->vmode != V32QImode && d->vmode != V16HImode))
39842 return false;
39844 for (i = 0; i < d->nelt; ++i)
39845 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39846 return false;
39848 if (d->testing_p)
39849 return true;
39851 nelt = d->nelt;
39852 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39854 /* Generate two permutation masks. In the first permutation mask
39855 the first quarter will contain indexes for the first half
39856 of the op0, the second quarter will contain bit 7 set, third quarter
39857 will contain indexes for the second half of the op0 and the
39858 last quarter bit 7 set. In the second permutation mask
39859 the first quarter will contain bit 7 set, the second quarter
39860 indexes for the first half of the op1, the third quarter bit 7 set
39861 and last quarter indexes for the second half of the op1.
39862 I.e. the first mask e.g. for V32QImode extract even will be:
39863 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39864 (all values masked with 0xf except for -128) and second mask
39865 for extract even will be
39866 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39867 m128 = GEN_INT (-128);
39868 for (i = 0; i < nelt; ++i)
39870 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39871 unsigned which = d->perm[i] >= nelt;
39872 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39874 for (j = 0; j < eltsz; ++j)
39876 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39877 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39881 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39882 vperm = force_reg (V32QImode, vperm);
39884 l = gen_reg_rtx (V32QImode);
39885 op = gen_lowpart (V32QImode, d->op0);
39886 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39888 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39889 vperm = force_reg (V32QImode, vperm);
39891 h = gen_reg_rtx (V32QImode);
39892 op = gen_lowpart (V32QImode, d->op1);
39893 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39895 ior = gen_reg_rtx (V32QImode);
39896 emit_insn (gen_iorv32qi3 (ior, l, h));
39898 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39899 op = gen_lowpart (V4DImode, d->target);
39900 ior = gen_lowpart (V4DImode, ior);
39901 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39902 const1_rtx, GEN_INT (3)));
39904 return true;
39907 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
39908 and extract-odd permutations. */
39910 static bool
39911 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39913 rtx t1, t2, t3;
39915 switch (d->vmode)
39917 case V4DFmode:
39918 t1 = gen_reg_rtx (V4DFmode);
39919 t2 = gen_reg_rtx (V4DFmode);
39921 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39922 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39923 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39925 /* Now an unpck[lh]pd will produce the result required. */
39926 if (odd)
39927 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39928 else
39929 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39930 emit_insn (t3);
39931 break;
39933 case V8SFmode:
39935 int mask = odd ? 0xdd : 0x88;
39937 t1 = gen_reg_rtx (V8SFmode);
39938 t2 = gen_reg_rtx (V8SFmode);
39939 t3 = gen_reg_rtx (V8SFmode);
39941 /* Shuffle within the 128-bit lanes to produce:
39942 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39943 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39944 GEN_INT (mask)));
39946 /* Shuffle the lanes around to produce:
39947 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39948 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39949 GEN_INT (0x3)));
39951 /* Shuffle within the 128-bit lanes to produce:
39952 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39953 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39955 /* Shuffle within the 128-bit lanes to produce:
39956 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39957 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39959 /* Shuffle the lanes around to produce:
39960 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39961 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39962 GEN_INT (0x20)));
39964 break;
39966 case V2DFmode:
39967 case V4SFmode:
39968 case V2DImode:
39969 case V4SImode:
39970 /* These are always directly implementable by expand_vec_perm_1. */
39971 gcc_unreachable ();
39973 case V8HImode:
39974 if (TARGET_SSSE3)
39975 return expand_vec_perm_pshufb2 (d);
39976 else
39978 /* We need 2*log2(N)-1 operations to achieve odd/even
39979 with interleave. */
39980 t1 = gen_reg_rtx (V8HImode);
39981 t2 = gen_reg_rtx (V8HImode);
39982 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39983 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39984 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39985 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39986 if (odd)
39987 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39988 else
39989 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
39990 emit_insn (t3);
39992 break;
39994 case V16QImode:
39995 if (TARGET_SSSE3)
39996 return expand_vec_perm_pshufb2 (d);
39997 else
39999 t1 = gen_reg_rtx (V16QImode);
40000 t2 = gen_reg_rtx (V16QImode);
40001 t3 = gen_reg_rtx (V16QImode);
40002 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40003 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40004 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40005 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40006 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40007 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40008 if (odd)
40009 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40010 else
40011 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40012 emit_insn (t3);
40014 break;
40016 case V16HImode:
40017 case V32QImode:
40018 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40020 case V4DImode:
40021 if (!TARGET_AVX2)
40023 struct expand_vec_perm_d d_copy = *d;
40024 d_copy.vmode = V4DFmode;
40025 d_copy.target = gen_lowpart (V4DFmode, d->target);
40026 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40027 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40028 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40031 t1 = gen_reg_rtx (V4DImode);
40032 t2 = gen_reg_rtx (V4DImode);
40034 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40035 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40036 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40038 /* Now a vpunpck[lh]qdq will produce the result required. */
40039 if (odd)
40040 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40041 else
40042 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40043 emit_insn (t3);
40044 break;
40046 case V8SImode:
40047 if (!TARGET_AVX2)
40049 struct expand_vec_perm_d d_copy = *d;
40050 d_copy.vmode = V8SFmode;
40051 d_copy.target = gen_lowpart (V8SFmode, d->target);
40052 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40053 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40054 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40057 t1 = gen_reg_rtx (V8SImode);
40058 t2 = gen_reg_rtx (V8SImode);
40060 /* Shuffle the lanes around into
40061 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40062 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40063 gen_lowpart (V4DImode, d->op0),
40064 gen_lowpart (V4DImode, d->op1),
40065 GEN_INT (0x20)));
40066 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40067 gen_lowpart (V4DImode, d->op0),
40068 gen_lowpart (V4DImode, d->op1),
40069 GEN_INT (0x31)));
40071 /* Swap the 2nd and 3rd position in each lane into
40072 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40073 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40074 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40075 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40076 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40078 /* Now a vpunpck[lh]qdq will produce
40079 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40080 if (odd)
40081 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40082 gen_lowpart (V4DImode, t1),
40083 gen_lowpart (V4DImode, t2));
40084 else
40085 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40086 gen_lowpart (V4DImode, t1),
40087 gen_lowpart (V4DImode, t2));
40088 emit_insn (t3);
40089 break;
40091 default:
40092 gcc_unreachable ();
40095 return true;
40098 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40099 extract-even and extract-odd permutations. */
40101 static bool
40102 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40104 unsigned i, odd, nelt = d->nelt;
40106 odd = d->perm[0];
40107 if (odd != 0 && odd != 1)
40108 return false;
40110 for (i = 1; i < nelt; ++i)
40111 if (d->perm[i] != 2 * i + odd)
40112 return false;
40114 return expand_vec_perm_even_odd_1 (d, odd);
40117 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
40118 permutations. We assume that expand_vec_perm_1 has already failed. */
40120 static bool
40121 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40123 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40124 enum machine_mode vmode = d->vmode;
40125 unsigned char perm2[4];
40126 rtx op0 = d->op0;
40127 bool ok;
40129 switch (vmode)
40131 case V4DFmode:
40132 case V8SFmode:
40133 /* These are special-cased in sse.md so that we can optionally
40134 use the vbroadcast instruction. They expand to two insns
40135 if the input happens to be in a register. */
40136 gcc_unreachable ();
40138 case V2DFmode:
40139 case V2DImode:
40140 case V4SFmode:
40141 case V4SImode:
40142 /* These are always implementable using standard shuffle patterns. */
40143 gcc_unreachable ();
40145 case V8HImode:
40146 case V16QImode:
40147 /* These can be implemented via interleave. We save one insn by
40148 stopping once we have promoted to V4SImode and then use pshufd. */
40151 rtx dest;
40152 rtx (*gen) (rtx, rtx, rtx)
40153 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40154 : gen_vec_interleave_lowv8hi;
40156 if (elt >= nelt2)
40158 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40159 : gen_vec_interleave_highv8hi;
40160 elt -= nelt2;
40162 nelt2 /= 2;
40164 dest = gen_reg_rtx (vmode);
40165 emit_insn (gen (dest, op0, op0));
40166 vmode = get_mode_wider_vector (vmode);
40167 op0 = gen_lowpart (vmode, dest);
40169 while (vmode != V4SImode);
40171 memset (perm2, elt, 4);
40172 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40173 d->testing_p);
40174 gcc_assert (ok);
40175 return true;
40177 case V32QImode:
40178 case V16HImode:
40179 case V8SImode:
40180 case V4DImode:
40181 /* With AVX2, broadcasts of the first element should already have
40182 been handled by vpbroadcast* or vpermq in expand_vec_perm_1. */
40183 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40184 return false;
40186 default:
40187 gcc_unreachable ();
40191 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40192 broadcast permutations. */
40194 static bool
40195 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40197 unsigned i, elt, nelt = d->nelt;
40199 if (!d->one_operand_p)
40200 return false;
40202 elt = d->perm[0];
40203 for (i = 1; i < nelt; ++i)
40204 if (d->perm[i] != elt)
40205 return false;
40207 return expand_vec_perm_broadcast_1 (d);
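/* Worked example (illustrative): broadcasting byte 5 of a V16QImode vector
   via expand_vec_perm_broadcast_1 first emits punpcklbw (byte 5 becomes
   word 5 of the V8HImode result), then punpckhwd (word 5 becomes dword 1
   of the V4SImode result), and finally a pshufd with { 1, 1, 1, 1 }
   replicates that dword, three insns in total.  */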
40210 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40211 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40212 all the shorter instruction sequences. */
40214 static bool
40215 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40217 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40218 unsigned int i, nelt, eltsz;
40219 bool used[4];
40221 if (!TARGET_AVX2
40222 || d->one_operand_p
40223 || (d->vmode != V32QImode && d->vmode != V16HImode))
40224 return false;
40226 if (d->testing_p)
40227 return true;
40229 nelt = d->nelt;
40230 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40232 /* Generate 4 permutation masks. If the required element is within
40233 the same lane, it is shuffled in. If the required element is from the
40234 other lane, force a zero by setting bit 7 in the permutation mask.
40235 The other mask has non-negative elements where the element is
40236 requested from the other lane; it is also moved to the other lane,
40237 so that the result of vpshufb can have the two V2TImode halves
40238 swapped. */
40239 m128 = GEN_INT (-128);
40240 for (i = 0; i < 32; ++i)
40242 rperm[0][i] = m128;
40243 rperm[1][i] = m128;
40244 rperm[2][i] = m128;
40245 rperm[3][i] = m128;
40247 used[0] = false;
40248 used[1] = false;
40249 used[2] = false;
40250 used[3] = false;
40251 for (i = 0; i < nelt; ++i)
40253 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40254 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40255 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40257 for (j = 0; j < eltsz; ++j)
40258 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40259 used[which] = true;
40262 for (i = 0; i < 2; ++i)
40264 if (!used[2 * i + 1])
40266 h[i] = NULL_RTX;
40267 continue;
40269 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40270 gen_rtvec_v (32, rperm[2 * i + 1]));
40271 vperm = force_reg (V32QImode, vperm);
40272 h[i] = gen_reg_rtx (V32QImode);
40273 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40274 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40278 /* Swap the 128-bit lanes of h[X]. */
40278 for (i = 0; i < 2; ++i)
40280 if (h[i] == NULL_RTX)
40281 continue;
40282 op = gen_reg_rtx (V4DImode);
40283 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40284 const2_rtx, GEN_INT (3), const0_rtx,
40285 const1_rtx));
40286 h[i] = gen_lowpart (V32QImode, op);
40289 for (i = 0; i < 2; ++i)
40291 if (!used[2 * i])
40293 l[i] = NULL_RTX;
40294 continue;
40296 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40297 vperm = force_reg (V32QImode, vperm);
40298 l[i] = gen_reg_rtx (V32QImode);
40299 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40300 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40303 for (i = 0; i < 2; ++i)
40305 if (h[i] && l[i])
40307 op = gen_reg_rtx (V32QImode);
40308 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40309 l[i] = op;
40311 else if (h[i])
40312 l[i] = h[i];
40315 gcc_assert (l[0] && l[1]);
40316 op = gen_lowpart (V32QImode, d->target);
40317 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40318 return true;
40321 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40322 With all of the interface bits taken care of, perform the expansion
40323 in D and return true on success. */
40325 static bool
40326 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40328 /* Try a single instruction expansion. */
40329 if (expand_vec_perm_1 (d))
40330 return true;
40332 /* Try sequences of two instructions. */
40334 if (expand_vec_perm_pshuflw_pshufhw (d))
40335 return true;
40337 if (expand_vec_perm_palignr (d))
40338 return true;
40340 if (expand_vec_perm_interleave2 (d))
40341 return true;
40343 if (expand_vec_perm_broadcast (d))
40344 return true;
40346 if (expand_vec_perm_vpermq_perm_1 (d))
40347 return true;
40349 if (expand_vec_perm_vperm2f128 (d))
40350 return true;
40352 /* Try sequences of three instructions. */
40354 if (expand_vec_perm_2vperm2f128_vshuf (d))
40355 return true;
40357 if (expand_vec_perm_pshufb2 (d))
40358 return true;
40360 if (expand_vec_perm_interleave3 (d))
40361 return true;
40363 if (expand_vec_perm_vperm2f128_vblend (d))
40364 return true;
40366 /* Try sequences of four instructions. */
40368 if (expand_vec_perm_vpshufb2_vpermq (d))
40369 return true;
40371 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40372 return true;
40374 /* ??? Look for narrow permutations whose element orderings would
40375 allow the promotion to a wider mode. */
40377 /* ??? Look for sequences of interleave or a wider permute that place
40378 the data into the correct lanes for a half-vector shuffle like
40379 pshuf[lh]w or vpermilps. */
40381 /* ??? Look for sequences of interleave that produce the desired results.
40382 The combinatorics of punpck[lh] get pretty ugly... */
40384 if (expand_vec_perm_even_odd (d))
40385 return true;
40387 /* Even longer sequences. */
40388 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40389 return true;
40391 return false;
40394 /* If a permutation only uses one operand, make it clear. Returns true
40395 if the permutation references both operands. */
40397 static bool
40398 canonicalize_perm (struct expand_vec_perm_d *d)
40400 int i, which, nelt = d->nelt;
40402 for (i = which = 0; i < nelt; ++i)
40403 which |= (d->perm[i] < nelt ? 1 : 2);
40405 d->one_operand_p = true;
40406 switch (which)
40408 default:
40409 gcc_unreachable();
40411 case 3:
40412 if (!rtx_equal_p (d->op0, d->op1))
40414 d->one_operand_p = false;
40415 break;
40417 /* The elements of PERM do not suggest that only the first operand
40418 is used, but both operands are identical. Allow easier matching
40419 of the permutation by folding the permutation into the single
40420 input vector. */
40421 /* FALLTHRU */
40423 case 2:
40424 for (i = 0; i < nelt; ++i)
40425 d->perm[i] &= nelt - 1;
40426 d->op0 = d->op1;
40427 break;
40429 case 1:
40430 d->op1 = d->op0;
40431 break;
40434 return (which == 3);
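/* For instance (illustrative): with nelt == 4 and d->perm = { 4, 5, 7, 6 }
   every index refers to the second operand, so WHICH == 2; the indices are
   folded to { 0, 1, 3, 2 }, d->op0 is replaced by d->op1, and false is
   returned because only one input vector is really used.  */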
40437 bool
40438 ix86_expand_vec_perm_const (rtx operands[4])
40440 struct expand_vec_perm_d d;
40441 unsigned char perm[MAX_VECT_LEN];
40442 int i, nelt;
40443 bool two_args;
40444 rtx sel;
40446 d.target = operands[0];
40447 d.op0 = operands[1];
40448 d.op1 = operands[2];
40449 sel = operands[3];
40451 d.vmode = GET_MODE (d.target);
40452 gcc_assert (VECTOR_MODE_P (d.vmode));
40453 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40454 d.testing_p = false;
40456 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40457 gcc_assert (XVECLEN (sel, 0) == nelt);
40458 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40460 for (i = 0; i < nelt; ++i)
40462 rtx e = XVECEXP (sel, 0, i);
40463 int ei = INTVAL (e) & (2 * nelt - 1);
40464 d.perm[i] = ei;
40465 perm[i] = ei;
40468 two_args = canonicalize_perm (&d);
40470 if (ix86_expand_vec_perm_const_1 (&d))
40471 return true;
40473 /* If the selector says both arguments are needed, but the operands are the
40474 same, the above tried to expand with one_operand_p and flattened selector.
40475 If that didn't work, retry without one_operand_p; we succeeded with that
40476 during testing. */
40477 if (two_args && d.one_operand_p)
40479 d.one_operand_p = false;
40480 memcpy (d.perm, perm, sizeof (perm));
40481 return ix86_expand_vec_perm_const_1 (&d);
40484 return false;
40487 /* Implement targetm.vectorize.vec_perm_const_ok. */
40489 static bool
40490 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40491 const unsigned char *sel)
40493 struct expand_vec_perm_d d;
40494 unsigned int i, nelt, which;
40495 bool ret;
40497 d.vmode = vmode;
40498 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40499 d.testing_p = true;
40501 /* Given sufficient ISA support we can just return true here
40502 for selected vector modes. */
40503 if (GET_MODE_SIZE (d.vmode) == 16)
40505 /* All implementable with a single vpperm insn. */
40506 if (TARGET_XOP)
40507 return true;
40508 /* All implementable with 2 pshufb + 1 ior. */
40509 if (TARGET_SSSE3)
40510 return true;
40511 /* All implementable with shufpd or unpck[lh]pd. */
40512 if (d.nelt == 2)
40513 return true;
40516 /* Extract the values from the vector CST into the permutation
40517 array in D. */
40518 memcpy (d.perm, sel, nelt);
40519 for (i = which = 0; i < nelt; ++i)
40521 unsigned char e = d.perm[i];
40522 gcc_assert (e < 2 * nelt);
40523 which |= (e < nelt ? 1 : 2);
40526 /* For all elements from second vector, fold the elements to first. */
40527 if (which == 2)
40528 for (i = 0; i < nelt; ++i)
40529 d.perm[i] -= nelt;
40531 /* Check whether the mask can be applied to the vector type. */
40532 d.one_operand_p = (which != 3);
40534 /* Implementable with shufps or pshufd. */
40535 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40536 return true;
40538 /* Otherwise we have to go through the motions and see if we can
40539 figure out how to generate the requested permutation. */
40540 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40541 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40542 if (!d.one_operand_p)
40543 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40545 start_sequence ();
40546 ret = ix86_expand_vec_perm_const_1 (&d);
40547 end_sequence ();
40549 return ret;
40552 void
40553 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40555 struct expand_vec_perm_d d;
40556 unsigned i, nelt;
40558 d.target = targ;
40559 d.op0 = op0;
40560 d.op1 = op1;
40561 d.vmode = GET_MODE (targ);
40562 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40563 d.one_operand_p = false;
40564 d.testing_p = false;
40566 for (i = 0; i < nelt; ++i)
40567 d.perm[i] = i * 2 + odd;
40569 /* We'll either be able to implement the permutation directly... */
40570 if (expand_vec_perm_1 (&d))
40571 return;
40573 /* ... or we use the special-case patterns. */
40574 expand_vec_perm_even_odd_1 (&d, odd);
40577 static void
40578 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40580 struct expand_vec_perm_d d;
40581 unsigned i, nelt, base;
40582 bool ok;
40584 d.target = targ;
40585 d.op0 = op0;
40586 d.op1 = op1;
40587 d.vmode = GET_MODE (targ);
40588 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40589 d.one_operand_p = false;
40590 d.testing_p = false;
40592 base = high_p ? nelt / 2 : 0;
40593 for (i = 0; i < nelt / 2; ++i)
40595 d.perm[i * 2] = i + base;
40596 d.perm[i * 2 + 1] = i + base + nelt;
40599 /* Note that for AVX this isn't one instruction. */
40600 ok = ix86_expand_vec_perm_const_1 (&d);
40601 gcc_assert (ok);
40605 /* Expand a vector operation CODE for a V*QImode in terms of the
40606 same operation on V*HImode. */
40608 void
40609 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40611 enum machine_mode qimode = GET_MODE (dest);
40612 enum machine_mode himode;
40613 rtx (*gen_il) (rtx, rtx, rtx);
40614 rtx (*gen_ih) (rtx, rtx, rtx);
40615 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40616 struct expand_vec_perm_d d;
40617 bool ok, full_interleave;
40618 bool uns_p = false;
40619 int i;
40621 switch (qimode)
40623 case V16QImode:
40624 himode = V8HImode;
40625 gen_il = gen_vec_interleave_lowv16qi;
40626 gen_ih = gen_vec_interleave_highv16qi;
40627 break;
40628 case V32QImode:
40629 himode = V16HImode;
40630 gen_il = gen_avx2_interleave_lowv32qi;
40631 gen_ih = gen_avx2_interleave_highv32qi;
40632 break;
40633 default:
40634 gcc_unreachable ();
40637 op2_l = op2_h = op2;
40638 switch (code)
40640 case MULT:
40641 /* Unpack data such that we've got a source byte in each low byte of
40642 each word. We don't care what goes into the high byte of each word.
40643 Rather than trying to get zero in there, it is most convenient to let
40644 it be a copy of the low byte. */
40645 op2_l = gen_reg_rtx (qimode);
40646 op2_h = gen_reg_rtx (qimode);
40647 emit_insn (gen_il (op2_l, op2, op2));
40648 emit_insn (gen_ih (op2_h, op2, op2));
40649 /* FALLTHRU */
40651 op1_l = gen_reg_rtx (qimode);
40652 op1_h = gen_reg_rtx (qimode);
40653 emit_insn (gen_il (op1_l, op1, op1));
40654 emit_insn (gen_ih (op1_h, op1, op1));
40655 full_interleave = qimode == V16QImode;
40656 break;
40658 case ASHIFT:
40659 case LSHIFTRT:
40660 uns_p = true;
40661 /* FALLTHRU */
40662 case ASHIFTRT:
40663 op1_l = gen_reg_rtx (himode);
40664 op1_h = gen_reg_rtx (himode);
40665 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40666 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40667 full_interleave = true;
40668 break;
40669 default:
40670 gcc_unreachable ();
40673 /* Perform the operation. */
40674 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40675 1, OPTAB_DIRECT);
40676 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40677 1, OPTAB_DIRECT);
40678 gcc_assert (res_l && res_h);
40680 /* Merge the data back into the right place. */
40681 d.target = dest;
40682 d.op0 = gen_lowpart (qimode, res_l);
40683 d.op1 = gen_lowpart (qimode, res_h);
40684 d.vmode = qimode;
40685 d.nelt = GET_MODE_NUNITS (qimode);
40686 d.one_operand_p = false;
40687 d.testing_p = false;
40689 if (full_interleave)
40691 /* For SSE2, we used a full interleave, so the desired
40692 results are in the even elements. */
40693 for (i = 0; i < 32; ++i)
40694 d.perm[i] = i * 2;
40696 else
40698 /* For AVX, the interleave used above was not cross-lane. So the
40699 extraction takes the even elements, but with the second and third
40700 quarters swapped. Happily, that is even one insn shorter than a plain even extraction. */
40701 for (i = 0; i < 32; ++i)
40702 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
40705 ok = ix86_expand_vec_perm_const_1 (&d);
40706 gcc_assert (ok);
40708 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40709 gen_rtx_fmt_ee (code, qimode, op1, op2));
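/* Illustrative note (not part of the original source): in the V16QImode
   case the selector built above is { 0, 2, 4, ..., 30 }, picking the low
   byte of each HImode result, elements 0-7 from RES_L and 8-15 from RES_H.
   In the V32QImode case the in-lane interleaves leave elements 8-15 in
   RES_H's low lane and 16-23 in RES_L's high lane, which is why the second
   and third quarters of the selector are swapped.  */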
40712 void
40713 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40714 bool uns_p, bool odd_p)
40716 enum machine_mode mode = GET_MODE (op1);
40717 enum machine_mode wmode = GET_MODE (dest);
40718 rtx x;
40720 /* We only play even/odd games with vectors of SImode. */
40721 gcc_assert (mode == V4SImode || mode == V8SImode);
40723 /* If we're looking for the odd results, shift those members down to
40724 the even slots. For some cpus this is faster than a PSHUFD. */
40725 if (odd_p)
40727 if (TARGET_XOP && mode == V4SImode)
40729 x = force_reg (wmode, CONST0_RTX (wmode));
40730 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40731 return;
40734 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40735 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40736 x, NULL, 1, OPTAB_DIRECT);
40737 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40738 x, NULL, 1, OPTAB_DIRECT);
40739 op1 = gen_lowpart (mode, op1);
40740 op2 = gen_lowpart (mode, op2);
40743 if (mode == V8SImode)
40745 if (uns_p)
40746 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40747 else
40748 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40750 else if (uns_p)
40751 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40752 else if (TARGET_SSE4_1)
40753 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40754 else
40756 rtx s1, s2, t0, t1, t2;
40758 /* The easiest way to implement this without PMULDQ is to go through
40759 the motions as if we were performing a full 64-bit multiply, except
40760 that we need to do less shuffling of the elements. */
40762 /* Compute the sign-extension, aka highparts, of the two operands. */
40763 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40764 op1, pc_rtx, pc_rtx);
40765 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40766 op2, pc_rtx, pc_rtx);
40768 /* Multiply LO(A) * HI(B), and vice-versa. */
40769 t1 = gen_reg_rtx (wmode);
40770 t2 = gen_reg_rtx (wmode);
40771 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40772 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40774 /* Multiply LO(A) * LO(B). */
40775 t0 = gen_reg_rtx (wmode);
40776 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40778 /* Combine and shift the highparts into place. */
40779 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40780 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40781 1, OPTAB_DIRECT);
40783 /* Combine high and low parts. */
40784 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40785 return;
40787 emit_insn (x);
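/* Illustrative sketch (an assumption, not part of this file): a scalar
   model of the PMULDQ-free path above.  The hypothetical helper below
   uses S1/S2 in place of the all-ones masks produced by the GT compares
   against zero.  */
#include <stdint.h>

static uint64_t
widen_smul_model (int32_t a, int32_t b)
{
  uint64_t s1 = a < 0 ? 0xffffffffu : 0;          /* sign mask of A */
  uint64_t s2 = b < 0 ? 0xffffffffu : 0;          /* sign mask of B */
  uint64_t t1 = s1 * (uint32_t) b;                /* HI(A) * LO(B) */
  uint64_t t2 = s2 * (uint32_t) a;                /* HI(B) * LO(A) */
  uint64_t t0 = (uint64_t) (uint32_t) a * (uint32_t) b;  /* LO * LO */

  /* Combining and shifting the highparts into place recovers the
     signed product modulo 2**64.  */
  return t0 + ((t1 + t2) << 32);
}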
40790 void
40791 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40792 bool uns_p, bool high_p)
40794 enum machine_mode wmode = GET_MODE (dest);
40795 enum machine_mode mode = GET_MODE (op1);
40796 rtx t1, t2, t3, t4, mask;
40798 switch (mode)
40800 case V4SImode:
40801 t1 = gen_reg_rtx (mode);
40802 t2 = gen_reg_rtx (mode);
40803 if (TARGET_XOP && !uns_p)
40805 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40806 shuffle the elements once so that all elements are in the right
40807 place for immediate use: { A C B D }. */
40808 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40809 const1_rtx, GEN_INT (3)));
40810 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40811 const1_rtx, GEN_INT (3)));
40813 else
40815 /* Put the elements into place for the multiply. */
40816 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40817 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40818 high_p = false;
40820 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40821 break;
40823 case V8SImode:
40824 /* Shuffle the elements between the lanes. After this we
40825 have { A B E F | C D G H } for each operand. */
40826 t1 = gen_reg_rtx (V4DImode);
40827 t2 = gen_reg_rtx (V4DImode);
40828 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40829 const0_rtx, const2_rtx,
40830 const1_rtx, GEN_INT (3)));
40831 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40832 const0_rtx, const2_rtx,
40833 const1_rtx, GEN_INT (3)));
40835 /* Shuffle the elements within the lanes. After this we
40836 have { A A B B | C C D D } or { E E F F | G G H H }. */
40837 t3 = gen_reg_rtx (V8SImode);
40838 t4 = gen_reg_rtx (V8SImode);
40839 mask = GEN_INT (high_p
40840 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40841 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
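/* Illustrative note (not part of the original source): the PSHUFD
   immediate packs four 2-bit element selectors, so for HIGH_P the value
   is 2 + (2 << 2) + (3 << 4) + (3 << 6) = 0xfa, selecting { 2, 2, 3, 3 }
   per lane; otherwise it is 0x50, selecting { 0, 0, 1, 1 }.  */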
40842 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40843 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40845 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40846 break;
40848 case V8HImode:
40849 case V16HImode:
40850 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40851 uns_p, OPTAB_DIRECT);
40852 t2 = expand_binop (mode,
40853 uns_p ? umul_highpart_optab : smul_highpart_optab,
40854 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40855 gcc_assert (t1 && t2);
40857 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40858 break;
40860 case V16QImode:
40861 case V32QImode:
40862 t1 = gen_reg_rtx (wmode);
40863 t2 = gen_reg_rtx (wmode);
40864 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40865 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40867 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40868 break;
40870 default:
40871 gcc_unreachable ();
40875 void
40876 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40878 rtx res_1, res_2;
40880 res_1 = gen_reg_rtx (V4SImode);
40881 res_2 = gen_reg_rtx (V4SImode);
40882 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40883 op1, op2, true, false);
40884 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40885 op1, op2, true, true);
40887 /* Move the results in element 2 down to element 1; we don't care
40888 what goes in elements 2 and 3. Then we can merge the parts
40889 back together with an interleave.
40891 Note that two other sequences were tried:
40892 (1) Use interleaves at the start instead of psrldq, which allows
40893 us to use a single shufps to merge things back at the end.
40894 (2) Use shufps here to combine the two vectors, then pshufd to
40895 put the elements in the correct order.
40896 In both cases the cost of the reformatting stall was too high
40897 and the overall sequence was slower. */
40899 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40900 const0_rtx, const0_rtx));
40901 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40902 const0_rtx, const0_rtx));
40903 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40905 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
40908 void
40909 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40911 enum machine_mode mode = GET_MODE (op0);
40912 rtx t1, t2, t3, t4, t5, t6;
40914 if (TARGET_XOP && mode == V2DImode)
40916 /* op1: A,B,C,D, op2: E,F,G,H */
40917 op1 = gen_lowpart (V4SImode, op1);
40918 op2 = gen_lowpart (V4SImode, op2);
40920 t1 = gen_reg_rtx (V4SImode);
40921 t2 = gen_reg_rtx (V4SImode);
40922 t3 = gen_reg_rtx (V2DImode);
40923 t4 = gen_reg_rtx (V2DImode);
40925 /* t1: B,A,D,C */
40926 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40927 GEN_INT (1),
40928 GEN_INT (0),
40929 GEN_INT (3),
40930 GEN_INT (2)));
40932 /* t2: (B*E),(A*F),(D*G),(C*H) */
40933 emit_insn (gen_mulv4si3 (t2, t1, op2));
40935 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40936 emit_insn (gen_xop_phadddq (t3, t2));
40938 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40939 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40941 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40942 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40944 else
40946 enum machine_mode nmode;
40947 rtx (*umul) (rtx, rtx, rtx);
40949 if (mode == V2DImode)
40951 umul = gen_vec_widen_umult_even_v4si;
40952 nmode = V4SImode;
40954 else if (mode == V4DImode)
40956 umul = gen_vec_widen_umult_even_v8si;
40957 nmode = V8SImode;
40959 else
40960 gcc_unreachable ();
40963 /* Multiply low parts. */
40964 t1 = gen_reg_rtx (mode);
40965 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40967 /* Shift input vectors right 32 bits so we can multiply high parts. */
40968 t6 = GEN_INT (32);
40969 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40970 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40972 /* Multiply high parts by low parts. */
40973 t4 = gen_reg_rtx (mode);
40974 t5 = gen_reg_rtx (mode);
40975 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40976 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40978 /* Combine and shift the highparts back. */
40979 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40980 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40982 /* Combine high and low parts. */
40983 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
40986 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40987 gen_rtx_MULT (mode, op1, op2));
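/* Illustrative sketch (an assumption, not part of this file): a scalar
   model of the decomposition used above, building a 64x64->64 multiply
   out of 32x32->64 widening multiplies; the helper name is hypothetical.  */
#include <stdint.h>

static uint64_t
mul64_from_mul32_model (uint64_t a, uint64_t b)
{
  uint64_t lo = (uint64_t) (uint32_t) a * (uint32_t) b;   /* LO(a) * LO(b) */
  uint64_t hi = ((a >> 32) * (uint32_t) b                 /* HI(a) * LO(b) */
                 + (b >> 32) * (uint32_t) a)              /* HI(b) * LO(a) */
                << 32;

  return lo + hi;   /* equal to a * b modulo 2**64 */
}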
40990 /* Expand an insert into a vector register through pinsr insn.
40991 Return true if successful. */
40993 bool
40994 ix86_expand_pinsr (rtx *operands)
40996 rtx dst = operands[0];
40997 rtx src = operands[3];
40999 unsigned int size = INTVAL (operands[1]);
41000 unsigned int pos = INTVAL (operands[2]);
41002 if (GET_CODE (dst) == SUBREG)
41004 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41005 dst = SUBREG_REG (dst);
41008 if (GET_CODE (src) == SUBREG)
41009 src = SUBREG_REG (src);
41011 switch (GET_MODE (dst))
41013 case V16QImode:
41014 case V8HImode:
41015 case V4SImode:
41016 case V2DImode:
41018 enum machine_mode srcmode, dstmode;
41019 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41021 srcmode = mode_for_size (size, MODE_INT, 0);
41023 switch (srcmode)
41025 case QImode:
41026 if (!TARGET_SSE4_1)
41027 return false;
41028 dstmode = V16QImode;
41029 pinsr = gen_sse4_1_pinsrb;
41030 break;
41032 case HImode:
41033 if (!TARGET_SSE2)
41034 return false;
41035 dstmode = V8HImode;
41036 pinsr = gen_sse2_pinsrw;
41037 break;
41039 case SImode:
41040 if (!TARGET_SSE4_1)
41041 return false;
41042 dstmode = V4SImode;
41043 pinsr = gen_sse4_1_pinsrd;
41044 break;
41046 case DImode:
41047 gcc_assert (TARGET_64BIT);
41048 if (!TARGET_SSE4_1)
41049 return false;
41050 dstmode = V2DImode;
41051 pinsr = gen_sse4_1_pinsrq;
41052 break;
41054 default:
41055 return false;
41058 dst = gen_lowpart (dstmode, dst);
41059 src = gen_lowpart (srcmode, src);
41061 pos /= size;
41063 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41064 return true;
41067 default:
41068 return false;
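/* Illustrative note (not part of the original source): for example,
   inserting a 16-bit value at bit position 48 of a V8HImode destination
   gives SIZE = 16 and POS = 48, so POS / SIZE = 3 and the insn operand
   becomes GEN_INT (1 << 3), the pattern's selector for element 3.  */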
41072 /* This function returns the calling abi specific va_list type node.
41073 It returns the FNDECL specific va_list type. */
41075 static tree
41076 ix86_fn_abi_va_list (tree fndecl)
41078 if (!TARGET_64BIT)
41079 return va_list_type_node;
41080 gcc_assert (fndecl != NULL_TREE);
41082 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41083 return ms_va_list_type_node;
41084 else
41085 return sysv_va_list_type_node;
41088 /* Returns the canonical va_list type specified by TYPE. If there
41089 is no valid TYPE provided, it returns NULL_TREE. */
41091 static tree
41092 ix86_canonical_va_list_type (tree type)
41094 tree wtype, htype;
41096 /* Resolve references and pointers to va_list type. */
41097 if (TREE_CODE (type) == MEM_REF)
41098 type = TREE_TYPE (type);
41099 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41100 type = TREE_TYPE (type);
41101 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41102 type = TREE_TYPE (type);
41104 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41106 wtype = va_list_type_node;
41107 gcc_assert (wtype != NULL_TREE);
41108 htype = type;
41109 if (TREE_CODE (wtype) == ARRAY_TYPE)
41111 /* If va_list is an array type, the argument may have decayed
41112 to a pointer type, e.g. by being passed to another function.
41113 In that case, unwrap both types so that we can compare the
41114 underlying records. */
41115 if (TREE_CODE (htype) == ARRAY_TYPE
41116 || POINTER_TYPE_P (htype))
41118 wtype = TREE_TYPE (wtype);
41119 htype = TREE_TYPE (htype);
41122 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41123 return va_list_type_node;
41124 wtype = sysv_va_list_type_node;
41125 gcc_assert (wtype != NULL_TREE);
41126 htype = type;
41127 if (TREE_CODE (wtype) == ARRAY_TYPE)
41129 /* If va_list is an array type, the argument may have decayed
41130 to a pointer type, e.g. by being passed to another function.
41131 In that case, unwrap both types so that we can compare the
41132 underlying records. */
41133 if (TREE_CODE (htype) == ARRAY_TYPE
41134 || POINTER_TYPE_P (htype))
41136 wtype = TREE_TYPE (wtype);
41137 htype = TREE_TYPE (htype);
41140 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41141 return sysv_va_list_type_node;
41142 wtype = ms_va_list_type_node;
41143 gcc_assert (wtype != NULL_TREE);
41144 htype = type;
41145 if (TREE_CODE (wtype) == ARRAY_TYPE)
41147 /* If va_list is an array type, the argument may have decayed
41148 to a pointer type, e.g. by being passed to another function.
41149 In that case, unwrap both types so that we can compare the
41150 underlying records. */
41151 if (TREE_CODE (htype) == ARRAY_TYPE
41152 || POINTER_TYPE_P (htype))
41154 wtype = TREE_TYPE (wtype);
41155 htype = TREE_TYPE (htype);
41158 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41159 return ms_va_list_type_node;
41160 return NULL_TREE;
41162 return std_canonical_va_list_type (type);
41165 /* Iterate through the target-specific builtin types for va_list.
41166 IDX denotes the iterator, *PTREE is set to the result type of
41167 the va_list builtin, and *PNAME to its internal type.
41168 Returns zero if there is no element for this index, otherwise
41169 IDX should be increased upon the next call.
41170 Note, do not iterate a base builtin's name like __builtin_va_list.
41171 Used from c_common_nodes_and_builtins. */
41173 static int
41174 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41176 if (TARGET_64BIT)
41178 switch (idx)
41180 default:
41181 break;
41183 case 0:
41184 *ptree = ms_va_list_type_node;
41185 *pname = "__builtin_ms_va_list";
41186 return 1;
41188 case 1:
41189 *ptree = sysv_va_list_type_node;
41190 *pname = "__builtin_sysv_va_list";
41191 return 1;
41195 return 0;
41198 #undef TARGET_SCHED_DISPATCH
41199 #define TARGET_SCHED_DISPATCH has_dispatch
41200 #undef TARGET_SCHED_DISPATCH_DO
41201 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41202 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41203 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41204 #undef TARGET_SCHED_REORDER
41205 #define TARGET_SCHED_REORDER ix86_sched_reorder
41206 #undef TARGET_SCHED_ADJUST_PRIORITY
41207 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41208 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41209 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41211 /* The size of the dispatch window is the total number of bytes of
41212 object code allowed in a window. */
41213 #define DISPATCH_WINDOW_SIZE 16
41215 /* Number of dispatch windows considered for scheduling. */
41216 #define MAX_DISPATCH_WINDOWS 3
41218 /* Maximum number of instructions in a window. */
41219 #define MAX_INSN 4
41221 /* Maximum number of immediate operands in a window. */
41222 #define MAX_IMM 4
41224 /* Maximum number of immediate bits allowed in a window. */
41225 #define MAX_IMM_SIZE 128
41227 /* Maximum number of 32 bit immediates allowed in a window. */
41228 #define MAX_IMM_32 4
41230 /* Maximum number of 64 bit immediates allowed in a window. */
41231 #define MAX_IMM_64 2
41233 /* Maximum total of loads or prefetches allowed in a window. */
41234 #define MAX_LOAD 2
41236 /* Maximum total of stores allowed in a window. */
41237 #define MAX_STORE 1
41239 #undef BIG
41240 #define BIG 100
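/* Illustrative note (not part of the original source): with the values
   above, MAX_DISPATCH_WINDOWS * DISPATCH_WINDOW_SIZE = 3 * 16 = 48 bytes;
   the code below treats windows 0 and 1 as full once their combined size
   reaches 32 bytes, or once adding another insn would push the total to
   48 bytes or more.  */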
41243 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41244 enum dispatch_group {
41245 disp_no_group = 0,
41246 disp_load,
41247 disp_store,
41248 disp_load_store,
41249 disp_prefetch,
41250 disp_imm,
41251 disp_imm_32,
41252 disp_imm_64,
41253 disp_branch,
41254 disp_cmp,
41255 disp_jcc,
41256 disp_last
41259 /* Number of allowable groups in a dispatch window. It is an array
41260 indexed by dispatch_group enum. 100 is used as a big number,
41261 because the number of these kinds of operations does not have any
41262 effect in the dispatch window, but we need them for other reasons in
41263 the table. */
41264 static unsigned int num_allowable_groups[disp_last] = {
41265 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41268 char group_name[disp_last + 1][16] = {
41269 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41270 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41271 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41274 /* Instruction path. */
41275 enum insn_path {
41276 no_path = 0,
41277 path_single, /* Single micro op. */
41278 path_double, /* Double micro op. */
41279 path_multi, /* Instructions with more than 2 micro ops. */
41280 last_path
41283 /* sched_insn_info records information about one instruction scheduled
41284 in the basic block: the insn itself, its dispatch group and path, and
41285 its byte and immediate sizes.
41287 Windows are allocated for each basic block and are linked
41288 together. */
41289 typedef struct sched_insn_info_s {
41290 rtx insn;
41291 enum dispatch_group group;
41292 enum insn_path path;
41293 int byte_len;
41294 int imm_bytes;
41295 } sched_insn_info;
41297 /* Linked list of dispatch windows. This is a two way list of
41298 dispatch windows of a basic block. It contains information about
41299 the number of uops in the window and the total number of
41300 instructions and of bytes in the object code for this dispatch
41301 window. */
41302 typedef struct dispatch_windows_s {
41303 int num_insn; /* Number of insns in the window. */
41304 int num_uops; /* Number of uops in the window. */
41305 int window_size; /* Number of bytes in the window. */
41306 int window_num; /* Window number, either 0 or 1. */
41307 int num_imm; /* Number of immediates in the window. */
41308 int num_imm_32; /* Number of 32 bit immediates in the window. */
41309 int num_imm_64; /* Number of 64 bit immediates in the window. */
41310 int imm_size; /* Total size in bytes of immediates in the window. */
41311 int num_loads; /* Total memory loads in the window. */
41312 int num_stores; /* Total memory stores in the window. */
41313 int violation; /* Violation exists in window. */
41314 sched_insn_info *window; /* Pointer to the window. */
41315 struct dispatch_windows_s *next;
41316 struct dispatch_windows_s *prev;
41317 } dispatch_windows;
41319 /* Immediate values used in an insn. */
41320 typedef struct imm_info_s
41322 int imm;
41323 int imm32;
41324 int imm64;
41325 } imm_info;
41327 static dispatch_windows *dispatch_window_list;
41328 static dispatch_windows *dispatch_window_list1;
41330 /* Get dispatch group of insn. */
41332 static enum dispatch_group
41333 get_mem_group (rtx insn)
41335 enum attr_memory memory;
41337 if (INSN_CODE (insn) < 0)
41338 return disp_no_group;
41339 memory = get_attr_memory (insn);
41340 if (memory == MEMORY_STORE)
41341 return disp_store;
41343 if (memory == MEMORY_LOAD)
41344 return disp_load;
41346 if (memory == MEMORY_BOTH)
41347 return disp_load_store;
41349 return disp_no_group;
41352 /* Return true if insn is a compare instruction. */
41354 static bool
41355 is_cmp (rtx insn)
41357 enum attr_type type;
41359 type = get_attr_type (insn);
41360 return (type == TYPE_TEST
41361 || type == TYPE_ICMP
41362 || type == TYPE_FCMP
41363 || GET_CODE (PATTERN (insn)) == COMPARE);
41366 /* Return true if a dispatch violation was encountered. */
41368 static bool
41369 dispatch_violation (void)
41371 if (dispatch_window_list->next)
41372 return dispatch_window_list->next->violation;
41373 return dispatch_window_list->violation;
41376 /* Return true if insn is a branch instruction. */
41378 static bool
41379 is_branch (rtx insn)
41381 return (CALL_P (insn) || JUMP_P (insn));
41384 /* Return true if insn is a prefetch instruction. */
41386 static bool
41387 is_prefetch (rtx insn)
41389 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41392 /* This function initializes a dispatch window and the list container holding a
41393 pointer to the window. */
41395 static void
41396 init_window (int window_num)
41398 int i;
41399 dispatch_windows *new_list;
41401 if (window_num == 0)
41402 new_list = dispatch_window_list;
41403 else
41404 new_list = dispatch_window_list1;
41406 new_list->num_insn = 0;
41407 new_list->num_uops = 0;
41408 new_list->window_size = 0;
41409 new_list->next = NULL;
41410 new_list->prev = NULL;
41411 new_list->window_num = window_num;
41412 new_list->num_imm = 0;
41413 new_list->num_imm_32 = 0;
41414 new_list->num_imm_64 = 0;
41415 new_list->imm_size = 0;
41416 new_list->num_loads = 0;
41417 new_list->num_stores = 0;
41418 new_list->violation = false;
41420 for (i = 0; i < MAX_INSN; i++)
41422 new_list->window[i].insn = NULL;
41423 new_list->window[i].group = disp_no_group;
41424 new_list->window[i].path = no_path;
41425 new_list->window[i].byte_len = 0;
41426 new_list->window[i].imm_bytes = 0;
41428 return;
41431 /* This function allocates and initializes a dispatch window and the
41432 list container holding a pointer to the window. */
41434 static dispatch_windows *
41435 allocate_window (void)
41437 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41438 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41440 return new_list;
41443 /* This routine initializes the dispatch scheduling information. It
41444 initiates building dispatch scheduler tables and constructs the
41445 first dispatch window. */
41447 static void
41448 init_dispatch_sched (void)
41450 /* Allocate a dispatch list and a window. */
41451 dispatch_window_list = allocate_window ();
41452 dispatch_window_list1 = allocate_window ();
41453 init_window (0);
41454 init_window (1);
41457 /* This function returns true if a branch is detected. End of a basic block
41458 does not have to be a branch, but here we assume only branches end a
41459 window. */
41461 static bool
41462 is_end_basic_block (enum dispatch_group group)
41464 return group == disp_branch;
41467 /* This function is called when the end of window processing is reached. */
41469 static void
41470 process_end_window (void)
41472 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41473 if (dispatch_window_list->next)
41475 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41476 gcc_assert (dispatch_window_list->window_size
41477 + dispatch_window_list1->window_size <= 48);
41478 init_window (1);
41480 init_window (0);
41483 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41484 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41485 for 48 bytes of instructions. Note that these windows are not the
41486 dispatch windows of size DISPATCH_WINDOW_SIZE. */
41488 static dispatch_windows *
41489 allocate_next_window (int window_num)
41491 if (window_num == 0)
41493 if (dispatch_window_list->next)
41494 init_window (1);
41495 init_window (0);
41496 return dispatch_window_list;
41499 dispatch_window_list->next = dispatch_window_list1;
41500 dispatch_window_list1->prev = dispatch_window_list;
41502 return dispatch_window_list1;
41505 /* Increment the number of immediate operands of an instruction. */
41507 static int
41508 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41510 if (*in_rtx == 0)
41511 return 0;
41513 switch (GET_CODE (*in_rtx))
41515 case CONST:
41516 case SYMBOL_REF:
41517 case CONST_INT:
41518 (imm_values->imm)++;
41519 if (x86_64_immediate_operand (*in_rtx, SImode))
41520 (imm_values->imm32)++;
41521 else
41522 (imm_values->imm64)++;
41523 break;
41525 case CONST_DOUBLE:
41526 (imm_values->imm)++;
41527 (imm_values->imm64)++;
41528 break;
41530 case CODE_LABEL:
41531 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41533 (imm_values->imm)++;
41534 (imm_values->imm32)++;
41536 break;
41538 default:
41539 break;
41542 return 0;
41545 /* Compute number of immediate operands of an instruction. */
41547 static void
41548 find_constant (rtx in_rtx, imm_info *imm_values)
41550 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41551 (rtx_function) find_constant_1, (void *) imm_values);
41554 /* Return the total size of the immediate operands of an instruction along
41555 with the number of corresponding immediate operands. It initializes its
41556 parameters to zero before calling FIND_CONSTANT.
41557 INSN is the input instruction. IMM is the total of immediates.
41558 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41559 bit immediates. */
41561 static int
41562 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41564 imm_info imm_values = {0, 0, 0};
41566 find_constant (insn, &imm_values);
41567 *imm = imm_values.imm;
41568 *imm32 = imm_values.imm32;
41569 *imm64 = imm_values.imm64;
41570 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
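/* Illustrative note (not part of the original source): the returned size
   is imm32 * 4 + imm64 * 8 bytes; e.g. an insn with one 32-bit and one
   64-bit immediate reports 4 + 8 = 12 bytes.  */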
41573 /* This function indicates whether an instruction has any immediate
41574 operands. */
41576 static bool
41577 has_immediate (rtx insn)
41579 int num_imm_operand;
41580 int num_imm32_operand;
41581 int num_imm64_operand;
41583 if (insn)
41584 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41585 &num_imm64_operand);
41586 return false;
41589 /* Return single or double path for instructions. */
41591 static enum insn_path
41592 get_insn_path (rtx insn)
41594 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41596 if ((int)path == 0)
41597 return path_single;
41599 if ((int)path == 1)
41600 return path_double;
41602 return path_multi;
41605 /* Return insn dispatch group. */
41607 static enum dispatch_group
41608 get_insn_group (rtx insn)
41610 enum dispatch_group group = get_mem_group (insn);
41611 if (group)
41612 return group;
41614 if (is_branch (insn))
41615 return disp_branch;
41617 if (is_cmp (insn))
41618 return disp_cmp;
41620 if (has_immediate (insn))
41621 return disp_imm;
41623 if (is_prefetch (insn))
41624 return disp_prefetch;
41626 return disp_no_group;
41629 /* Count number of GROUP restricted instructions in a dispatch
41630 window WINDOW_LIST. */
41632 static int
41633 count_num_restricted (rtx insn, dispatch_windows *window_list)
41635 enum dispatch_group group = get_insn_group (insn);
41636 int imm_size;
41637 int num_imm_operand;
41638 int num_imm32_operand;
41639 int num_imm64_operand;
41641 if (group == disp_no_group)
41642 return 0;
41644 if (group == disp_imm)
41646 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41647 &num_imm64_operand);
41648 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41649 || num_imm_operand + window_list->num_imm > MAX_IMM
41650 || (num_imm32_operand > 0
41651 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41652 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41653 || (num_imm64_operand > 0
41654 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41655 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41656 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41657 && num_imm64_operand > 0
41658 && ((window_list->num_imm_64 > 0
41659 && window_list->num_insn >= 2)
41660 || window_list->num_insn >= 3)))
41661 return BIG;
41663 return 1;
41666 if ((group == disp_load_store
41667 && (window_list->num_loads >= MAX_LOAD
41668 || window_list->num_stores >= MAX_STORE))
41669 || ((group == disp_load
41670 || group == disp_prefetch)
41671 && window_list->num_loads >= MAX_LOAD)
41672 || (group == disp_store
41673 && window_list->num_stores >= MAX_STORE))
41674 return BIG;
41676 return 1;
41679 /* This function returns true if insn satisfies dispatch rules on the
41680 last window scheduled. */
41682 static bool
41683 fits_dispatch_window (rtx insn)
41685 dispatch_windows *window_list = dispatch_window_list;
41686 dispatch_windows *window_list_next = dispatch_window_list->next;
41687 unsigned int num_restrict;
41688 enum dispatch_group group = get_insn_group (insn);
41689 enum insn_path path = get_insn_path (insn);
41690 int sum;
41692 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41693 instructions should be given the lowest priority in the
41694 scheduling process in the Haifa scheduler to make sure they will be
41695 scheduled in the same dispatch window as the reference to them. */
41696 if (group == disp_jcc || group == disp_cmp)
41697 return false;
41699 /* Check nonrestricted. */
41700 if (group == disp_no_group || group == disp_branch)
41701 return true;
41703 /* Get last dispatch window. */
41704 if (window_list_next)
41705 window_list = window_list_next;
41707 if (window_list->window_num == 1)
41709 sum = window_list->prev->window_size + window_list->window_size;
41711 if (sum == 32
41712 || (min_insn_size (insn) + sum) >= 48)
41713 /* Window 1 is full. Go for next window. */
41714 return true;
41717 num_restrict = count_num_restricted (insn, window_list);
41719 if (num_restrict > num_allowable_groups[group])
41720 return false;
41722 /* See if it fits in the first window. */
41723 if (window_list->window_num == 0)
41725 /* The first window should have only single and double path
41726 uops. */
41727 if (path == path_double
41728 && (window_list->num_uops + 2) > MAX_INSN)
41729 return false;
41730 else if (path != path_single)
41731 return false;
41733 return true;
41736 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41737 dispatch window WINDOW_LIST. */
41739 static void
41740 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41742 int byte_len = min_insn_size (insn);
41743 int num_insn = window_list->num_insn;
41744 int imm_size;
41745 sched_insn_info *window = window_list->window;
41746 enum dispatch_group group = get_insn_group (insn);
41747 enum insn_path path = get_insn_path (insn);
41748 int num_imm_operand;
41749 int num_imm32_operand;
41750 int num_imm64_operand;
41752 if (!window_list->violation && group != disp_cmp
41753 && !fits_dispatch_window (insn))
41754 window_list->violation = true;
41756 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41757 &num_imm64_operand);
41759 /* Initialize window with new instruction. */
41760 window[num_insn].insn = insn;
41761 window[num_insn].byte_len = byte_len;
41762 window[num_insn].group = group;
41763 window[num_insn].path = path;
41764 window[num_insn].imm_bytes = imm_size;
41766 window_list->window_size += byte_len;
41767 window_list->num_insn = num_insn + 1;
41768 window_list->num_uops = window_list->num_uops + num_uops;
41769 window_list->imm_size += imm_size;
41770 window_list->num_imm += num_imm_operand;
41771 window_list->num_imm_32 += num_imm32_operand;
41772 window_list->num_imm_64 += num_imm64_operand;
41774 if (group == disp_store)
41775 window_list->num_stores += 1;
41776 else if (group == disp_load
41777 || group == disp_prefetch)
41778 window_list->num_loads += 1;
41779 else if (group == disp_load_store)
41781 window_list->num_stores += 1;
41782 window_list->num_loads += 1;
41786 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41787 If the total bytes of instructions or the number of instructions in
41788 the window exceeds the allowable limit, it allocates a new window.
41790 static void
41791 add_to_dispatch_window (rtx insn)
41793 int byte_len;
41794 dispatch_windows *window_list;
41795 dispatch_windows *next_list;
41796 dispatch_windows *window0_list;
41797 enum insn_path path;
41798 enum dispatch_group insn_group;
41799 bool insn_fits;
41800 int num_insn;
41801 int num_uops;
41802 int window_num;
41803 int insn_num_uops;
41804 int sum;
41806 if (INSN_CODE (insn) < 0)
41807 return;
41809 byte_len = min_insn_size (insn);
41810 window_list = dispatch_window_list;
41811 next_list = window_list->next;
41812 path = get_insn_path (insn);
41813 insn_group = get_insn_group (insn);
41815 /* Get the last dispatch window. */
41816 if (next_list)
41817 window_list = dispatch_window_list->next;
41819 if (path == path_single)
41820 insn_num_uops = 1;
41821 else if (path == path_double)
41822 insn_num_uops = 2;
41823 else
41824 insn_num_uops = (int) path;
41826 /* If the current window is full, get a new window.
41827 Window number zero is full if MAX_INSN uops are scheduled in it.
41828 Window number one is full if window zero's bytes plus window
41829 one's bytes equal 32, or if adding the bytes of the new instruction
41830 to the total makes it greater than 48, or if it already has MAX_INSN
41831 instructions in it.
41832 num_insn = window_list->num_insn;
41833 num_uops = window_list->num_uops;
41834 window_num = window_list->window_num;
41835 insn_fits = fits_dispatch_window (insn);
41837 if (num_insn >= MAX_INSN
41838 || num_uops + insn_num_uops > MAX_INSN
41839 || !(insn_fits))
41841 window_num = ~window_num & 1;
41842 window_list = allocate_next_window (window_num);
41845 if (window_num == 0)
41847 add_insn_window (insn, window_list, insn_num_uops);
41848 if (window_list->num_insn >= MAX_INSN
41849 && insn_group == disp_branch)
41851 process_end_window ();
41852 return;
41855 else if (window_num == 1)
41857 window0_list = window_list->prev;
41858 sum = window0_list->window_size + window_list->window_size;
41859 if (sum == 32
41860 || (byte_len + sum) >= 48)
41862 process_end_window ();
41863 window_list = dispatch_window_list;
41866 add_insn_window (insn, window_list, insn_num_uops);
41868 else
41869 gcc_unreachable ();
41871 if (is_end_basic_block (insn_group))
41873 /* The end of the basic block has been reached; do end-of-basic-block processing. */
41874 process_end_window ();
41875 return;
41879 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41881 DEBUG_FUNCTION static void
41882 debug_dispatch_window_file (FILE *file, int window_num)
41884 dispatch_windows *list;
41885 int i;
41887 if (window_num == 0)
41888 list = dispatch_window_list;
41889 else
41890 list = dispatch_window_list1;
41892 fprintf (file, "Window #%d:\n", list->window_num);
41893 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41894 list->num_insn, list->num_uops, list->window_size);
41895 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41896 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41898 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41899 list->num_stores);
41900 fprintf (file, " insn info:\n");
41902 for (i = 0; i < MAX_INSN; i++)
41904 if (!list->window[i].insn)
41905 break;
41906 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41907 i, group_name[list->window[i].group],
41908 i, (void *)list->window[i].insn,
41909 i, list->window[i].path,
41910 i, list->window[i].byte_len,
41911 i, list->window[i].imm_bytes);
41915 /* Print to stdout a dispatch window. */
41917 DEBUG_FUNCTION void
41918 debug_dispatch_window (int window_num)
41920 debug_dispatch_window_file (stdout, window_num);
41923 /* Print INSN dispatch information to FILE. */
41925 DEBUG_FUNCTION static void
41926 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41928 int byte_len;
41929 enum insn_path path;
41930 enum dispatch_group group;
41931 int imm_size;
41932 int num_imm_operand;
41933 int num_imm32_operand;
41934 int num_imm64_operand;
41936 if (INSN_CODE (insn) < 0)
41937 return;
41939 byte_len = min_insn_size (insn);
41940 path = get_insn_path (insn);
41941 group = get_insn_group (insn);
41942 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41943 &num_imm64_operand);
41945 fprintf (file, " insn info:\n");
41946 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41947 group_name[group], path, byte_len);
41948 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41949 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41952 /* Print to STDERR the status of the ready list with respect to
41953 dispatch windows. */
41955 DEBUG_FUNCTION void
41956 debug_ready_dispatch (void)
41958 int i;
41959 int no_ready = number_in_ready ();
41961 fprintf (stdout, "Number of ready: %d\n", no_ready);
41963 for (i = 0; i < no_ready; i++)
41964 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41967 /* This routine is the driver of the dispatch scheduler. */
41969 static void
41970 do_dispatch (rtx insn, int mode)
41972 if (mode == DISPATCH_INIT)
41973 init_dispatch_sched ();
41974 else if (mode == ADD_TO_DISPATCH_WINDOW)
41975 add_to_dispatch_window (insn);
41978 /* Return TRUE if Dispatch Scheduling is supported. */
41980 static bool
41981 has_dispatch (rtx insn, int action)
41983 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
41984 && flag_dispatch_scheduler)
41985 switch (action)
41987 default:
41988 return false;
41990 case IS_DISPATCH_ON:
41991 return true;
41992 break;
41994 case IS_CMP:
41995 return is_cmp (insn);
41997 case DISPATCH_VIOLATION:
41998 return dispatch_violation ();
42000 case FITS_DISPATCH_WINDOW:
42001 return fits_dispatch_window (insn);
42004 return false;
42007 /* Implementation of reassociation_width target hook used by
42008 reassoc phase to identify parallelism level in reassociated
42009 tree. The statement's tree_code is passed in OPC. The type of the
42010 arguments is passed in MODE.
42012 Currently parallel reassociation is enabled for Atom
42013 processors only and we set reassociation width to be 2
42014 because Atom may issue up to 2 instructions per cycle.
42016 Return value should be fixed if parallel reassociation is
42017 enabled for other processors. */
42019 static int
42020 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42021 enum machine_mode mode)
42023 int res = 1;
42025 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42026 res = 2;
42027 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42028 res = 2;
42030 return res;
42033 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42034 place emms and femms instructions. */
42036 static enum machine_mode
42037 ix86_preferred_simd_mode (enum machine_mode mode)
42039 if (!TARGET_SSE)
42040 return word_mode;
42042 switch (mode)
42044 case QImode:
42045 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42046 case HImode:
42047 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42048 case SImode:
42049 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42050 case DImode:
42051 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42053 case SFmode:
42054 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42055 return V8SFmode;
42056 else
42057 return V4SFmode;
42059 case DFmode:
42060 if (!TARGET_VECTORIZE_DOUBLE)
42061 return word_mode;
42062 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42063 return V4DFmode;
42064 else if (TARGET_SSE2)
42065 return V2DFmode;
42066 /* FALLTHRU */
42068 default:
42069 return word_mode;
42073 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
42074 vectors. */
42076 static unsigned int
42077 ix86_autovectorize_vector_sizes (void)
42079 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42084 /* Return class of registers which could be used for pseudo of MODE
42085 and of class RCLASS for spilling instead of memory. Return NO_REGS
42086 if it is not possible or unprofitable. */
42087 static reg_class_t
42088 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42090 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42091 && hard_reg_set_subset_p (reg_class_contents[rclass],
42092 reg_class_contents[GENERAL_REGS])
42093 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
42094 return SSE_REGS;
42095 return NO_REGS;
42098 /* Implement targetm.vectorize.init_cost. */
42100 static void *
42101 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42103 unsigned *cost = XNEWVEC (unsigned, 3);
42104 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42105 return cost;
42108 /* Implement targetm.vectorize.add_stmt_cost. */
42110 static unsigned
42111 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42112 struct _stmt_vec_info *stmt_info, int misalign,
42113 enum vect_cost_model_location where)
42115 unsigned *cost = (unsigned *) data;
42116 unsigned retval = 0;
42118 if (flag_vect_cost_model)
42120 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42121 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42123 /* Statements in an inner loop relative to the loop being
42124 vectorized are weighted more heavily. The value here is
42125 arbitrary and could potentially be improved with analysis. */
42126 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42127 count *= 50; /* FIXME. */
42129 retval = (unsigned) (count * stmt_cost);
42130 cost[where] += retval;
42133 return retval;
42136 /* Implement targetm.vectorize.finish_cost. */
42138 static void
42139 ix86_finish_cost (void *data, unsigned *prologue_cost,
42140 unsigned *body_cost, unsigned *epilogue_cost)
42142 unsigned *cost = (unsigned *) data;
42143 *prologue_cost = cost[vect_prologue];
42144 *body_cost = cost[vect_body];
42145 *epilogue_cost = cost[vect_epilogue];
42148 /* Implement targetm.vectorize.destroy_cost_data. */
42150 static void
42151 ix86_destroy_cost_data (void *data)
42153 free (data);
42156 /* Validate target specific memory model bits in VAL. */
42158 static unsigned HOST_WIDE_INT
42159 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42161 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42162 bool strong;
42164 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42165 |MEMMODEL_MASK)
42166 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42168 warning (OPT_Winvalid_memory_model,
42169 "Unknown architecture specific memory model");
42170 return MEMMODEL_SEQ_CST;
42172 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42173 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42175 warning (OPT_Winvalid_memory_model,
42176 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42177 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42179 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42181 warning (OPT_Winvalid_memory_model,
42182 "HLE_RELEASE not used with RELEASE or stronger memory model");
42183 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42185 return val;
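/* Illustrative note (an assumption, not part of the original source): the
   HLE bits are meant to be OR-ed into the C11/C++11 memory model by the
   user, e.g. something like
     __atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);
   using the __ATOMIC_HLE_* macros GCC documents for x86 HLE.  The check
   above warns when an HLE bit is combined with a model weaker than the
   one it requires, e.g. HLE_ACQUIRE with a relaxed model.  */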
42188 /* Initialize the GCC target structure. */
42189 #undef TARGET_RETURN_IN_MEMORY
42190 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42192 #undef TARGET_LEGITIMIZE_ADDRESS
42193 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42195 #undef TARGET_ATTRIBUTE_TABLE
42196 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42197 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42198 # undef TARGET_MERGE_DECL_ATTRIBUTES
42199 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42200 #endif
42202 #undef TARGET_COMP_TYPE_ATTRIBUTES
42203 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42205 #undef TARGET_INIT_BUILTINS
42206 #define TARGET_INIT_BUILTINS ix86_init_builtins
42207 #undef TARGET_BUILTIN_DECL
42208 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42209 #undef TARGET_EXPAND_BUILTIN
42210 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42212 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42213 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42214 ix86_builtin_vectorized_function
42216 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42217 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42219 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42220 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42222 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42223 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42225 #undef TARGET_BUILTIN_RECIPROCAL
42226 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42228 #undef TARGET_ASM_FUNCTION_EPILOGUE
42229 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42231 #undef TARGET_ENCODE_SECTION_INFO
42232 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42233 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42234 #else
42235 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42236 #endif
42238 #undef TARGET_ASM_OPEN_PAREN
42239 #define TARGET_ASM_OPEN_PAREN ""
42240 #undef TARGET_ASM_CLOSE_PAREN
42241 #define TARGET_ASM_CLOSE_PAREN ""
42243 #undef TARGET_ASM_BYTE_OP
42244 #define TARGET_ASM_BYTE_OP ASM_BYTE
42246 #undef TARGET_ASM_ALIGNED_HI_OP
42247 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42248 #undef TARGET_ASM_ALIGNED_SI_OP
42249 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42250 #ifdef ASM_QUAD
42251 #undef TARGET_ASM_ALIGNED_DI_OP
42252 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42253 #endif
42255 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42256 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42258 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42259 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42261 #undef TARGET_ASM_UNALIGNED_HI_OP
42262 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42263 #undef TARGET_ASM_UNALIGNED_SI_OP
42264 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42265 #undef TARGET_ASM_UNALIGNED_DI_OP
42266 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42268 #undef TARGET_PRINT_OPERAND
42269 #define TARGET_PRINT_OPERAND ix86_print_operand
42270 #undef TARGET_PRINT_OPERAND_ADDRESS
42271 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42272 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42273 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42274 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42275 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42277 #undef TARGET_SCHED_INIT_GLOBAL
42278 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42279 #undef TARGET_SCHED_ADJUST_COST
42280 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42281 #undef TARGET_SCHED_ISSUE_RATE
42282 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42283 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42284 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42285 ia32_multipass_dfa_lookahead
42287 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42288 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42290 #undef TARGET_MEMMODEL_CHECK
42291 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42293 #ifdef HAVE_AS_TLS
42294 #undef TARGET_HAVE_TLS
42295 #define TARGET_HAVE_TLS true
42296 #endif
42297 #undef TARGET_CANNOT_FORCE_CONST_MEM
42298 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42299 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42300 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42302 #undef TARGET_DELEGITIMIZE_ADDRESS
42303 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42305 #undef TARGET_MS_BITFIELD_LAYOUT_P
42306 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42308 #if TARGET_MACHO
42309 #undef TARGET_BINDS_LOCAL_P
42310 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42311 #endif
42312 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42313 #undef TARGET_BINDS_LOCAL_P
42314 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42315 #endif
42317 #undef TARGET_ASM_OUTPUT_MI_THUNK
42318 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42319 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42320 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42322 #undef TARGET_ASM_FILE_START
42323 #define TARGET_ASM_FILE_START x86_file_start
42325 #undef TARGET_OPTION_OVERRIDE
42326 #define TARGET_OPTION_OVERRIDE ix86_option_override
42328 #undef TARGET_REGISTER_MOVE_COST
42329 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42330 #undef TARGET_MEMORY_MOVE_COST
42331 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42332 #undef TARGET_RTX_COSTS
42333 #define TARGET_RTX_COSTS ix86_rtx_costs
42334 #undef TARGET_ADDRESS_COST
42335 #define TARGET_ADDRESS_COST ix86_address_cost
42337 #undef TARGET_FIXED_CONDITION_CODE_REGS
42338 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42339 #undef TARGET_CC_MODES_COMPATIBLE
42340 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42342 #undef TARGET_MACHINE_DEPENDENT_REORG
42343 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42345 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42346 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42348 #undef TARGET_BUILD_BUILTIN_VA_LIST
42349 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42351 #undef TARGET_FOLD_BUILTIN
42352 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42354 #undef TARGET_COMPARE_VERSION_PRIORITY
42355 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42357 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42358 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42359 ix86_generate_version_dispatcher_body
42361 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42362 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42363 ix86_get_function_versions_dispatcher
42365 #undef TARGET_ENUM_VA_LIST_P
42366 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42368 #undef TARGET_FN_ABI_VA_LIST
42369 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42371 #undef TARGET_CANONICAL_VA_LIST_TYPE
42372 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42374 #undef TARGET_EXPAND_BUILTIN_VA_START
42375 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42377 #undef TARGET_MD_ASM_CLOBBERS
42378 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42380 #undef TARGET_PROMOTE_PROTOTYPES
42381 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42382 #undef TARGET_STRUCT_VALUE_RTX
42383 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42384 #undef TARGET_SETUP_INCOMING_VARARGS
42385 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42386 #undef TARGET_MUST_PASS_IN_STACK
42387 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42388 #undef TARGET_FUNCTION_ARG_ADVANCE
42389 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42390 #undef TARGET_FUNCTION_ARG
42391 #define TARGET_FUNCTION_ARG ix86_function_arg
42392 #undef TARGET_FUNCTION_ARG_BOUNDARY
42393 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42394 #undef TARGET_PASS_BY_REFERENCE
42395 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42396 #undef TARGET_INTERNAL_ARG_POINTER
42397 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42398 #undef TARGET_UPDATE_STACK_BOUNDARY
42399 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42400 #undef TARGET_GET_DRAP_RTX
42401 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42402 #undef TARGET_STRICT_ARGUMENT_NAMING
42403 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42404 #undef TARGET_STATIC_CHAIN
42405 #define TARGET_STATIC_CHAIN ix86_static_chain
42406 #undef TARGET_TRAMPOLINE_INIT
42407 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42408 #undef TARGET_RETURN_POPS_ARGS
42409 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42411 #undef TARGET_LEGITIMATE_COMBINED_INSN
42412 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42414 #undef TARGET_ASAN_SHADOW_OFFSET
42415 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42417 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42418 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42420 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42421 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42423 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42424 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42426 #undef TARGET_C_MODE_FOR_SUFFIX
42427 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42429 #ifdef HAVE_AS_TLS
42430 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42431 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42432 #endif
42434 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42435 #undef TARGET_INSERT_ATTRIBUTES
42436 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42437 #endif
42439 #undef TARGET_MANGLE_TYPE
42440 #define TARGET_MANGLE_TYPE ix86_mangle_type
42442 #if !TARGET_MACHO
42443 #undef TARGET_STACK_PROTECT_FAIL
42444 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42445 #endif
42447 #undef TARGET_FUNCTION_VALUE
42448 #define TARGET_FUNCTION_VALUE ix86_function_value
42450 #undef TARGET_FUNCTION_VALUE_REGNO_P
42451 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42453 #undef TARGET_PROMOTE_FUNCTION_MODE
42454 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42456 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42457 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42459 #undef TARGET_INSTANTIATE_DECLS
42460 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42462 #undef TARGET_SECONDARY_RELOAD
42463 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42465 #undef TARGET_CLASS_MAX_NREGS
42466 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42468 #undef TARGET_PREFERRED_RELOAD_CLASS
42469 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42470 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42471 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42472 #undef TARGET_CLASS_LIKELY_SPILLED_P
42473 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42475 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42476 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42477 ix86_builtin_vectorization_cost
42478 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42479 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42480 ix86_vectorize_vec_perm_const_ok
42481 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42482 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42483 ix86_preferred_simd_mode
42484 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42485 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42486 ix86_autovectorize_vector_sizes
42487 #undef TARGET_VECTORIZE_INIT_COST
42488 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42489 #undef TARGET_VECTORIZE_ADD_STMT_COST
42490 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42491 #undef TARGET_VECTORIZE_FINISH_COST
42492 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42493 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42494 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42496 #undef TARGET_SET_CURRENT_FUNCTION
42497 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42499 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42500 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42502 #undef TARGET_OPTION_SAVE
42503 #define TARGET_OPTION_SAVE ix86_function_specific_save
42505 #undef TARGET_OPTION_RESTORE
42506 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42508 #undef TARGET_OPTION_PRINT
42509 #define TARGET_OPTION_PRINT ix86_function_specific_print
42511 #undef TARGET_OPTION_FUNCTION_VERSIONS
42512 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42514 #undef TARGET_CAN_INLINE_P
42515 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42517 #undef TARGET_EXPAND_TO_RTL_HOOK
42518 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42520 #undef TARGET_LEGITIMATE_ADDRESS_P
42521 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42523 #undef TARGET_LRA_P
42524 #define TARGET_LRA_P hook_bool_void_true
42526 #undef TARGET_REGISTER_PRIORITY
42527 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42529 #undef TARGET_LEGITIMATE_CONSTANT_P
42530 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42532 #undef TARGET_FRAME_POINTER_REQUIRED
42533 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42535 #undef TARGET_CAN_ELIMINATE
42536 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42538 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42539 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42541 #undef TARGET_ASM_CODE_END
42542 #define TARGET_ASM_CODE_END ix86_code_end
42544 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42545 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42547 #if TARGET_MACHO
42548 #undef TARGET_INIT_LIBFUNCS
42549 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42550 #endif
42552 #undef TARGET_SPILL_CLASS
42553 #define TARGET_SPILL_CLASS ix86_spill_class
42555 struct gcc_target targetm = TARGET_INITIALIZER;
42557 #include "gt-i386.h"