Merge trunk version 195164 into gupc branch.
gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
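/* As a usage sketch: the per-mode entries in the cost tables below are
   indexed with this macro, e.g. along the lines of

     cost->mult_init[MODE_INDEX (mode)]
     cost->divide[MODE_INDEX (mode)]

   assuming `mult_init' and `divide' are the per-mode arrays of struct
   processor_costs and the fifth slot covers the remaining ("other")
   modes.  */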
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
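/* Worked example of the common scale: given COSTS_N_INSNS (N) == (N) * 4
   and a 2-byte add, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so the
   size-tuned costs below stay comparable to the speed-tuned tables.  */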
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
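/* Layout sketch of the stringop_algs initializers used below (as read
   from the initializers themselves):

     {alg_for_unknown_size, {{max_size, alg, noalign}, ...}}

   where a max_size of -1 terminates the list and covers all larger
   sizes.  DUMMY_STRINGOP_ALGS simply falls back to a libcall and fills
   the table slot that does not apply for a given tuning (apparently the
   64-bit variant for 32-bit-only processors).  */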
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
 153   1,					/* cond_not_taken_branch_cost.  */
 154 };
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
 224   1,					/* cond_not_taken_branch_cost.  */
 225 };
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. 486 has 8kB cache
270 shared for code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
 296   1,					/* cond_not_taken_branch_cost.  */
 297 };
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
 366   1,					/* cond_not_taken_branch_cost.  */
 367 };
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
 422   /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 423      (we ensure the alignment).  For small blocks an inline loop is still a
 424      noticeable win; for bigger blocks either rep movsl or rep movsb is the
 425      way to go.  Rep movsb apparently has a more expensive startup time in the
 426      CPU, but after 4K the difference is down in the noise.  */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
 445   1,					/* cond_not_taken_branch_cost.  */
 446 };
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
 516   1,					/* cond_not_taken_branch_cost.  */
 517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
 589   1,					/* cond_not_taken_branch_cost.  */
 590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 645   /* For some reason, Athlon deals better with the REP prefix (relative to
 646      loops) than K8 does.  Alignment becomes important after 8 bytes for
 647      memcpy and 128 bytes for memset.  */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
 662   1,					/* cond_not_taken_branch_cost.  */
 663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
 710   /* New AMD processors never drop prefetches; if they cannot be performed
 711      immediately, they are queued.  We set the number of simultaneous prefetches
 712      to a large constant to reflect this (leaving the number of prefetches
 713      completely unlimited is probably not a good idea either, as their
 714      execution also takes some time).  */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 723   /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 724      small blocks it is better to use a loop.  For large blocks, a libcall can
 725      do non-temporal accesses and beat inline code considerably.  */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
734 4, /* scalar_stmt_cost. */
735 2, /* scalar load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
 744   2,					/* cond_not_taken_branch_cost.  */
 745 };
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
 799   /* New AMD processors never drop prefetches; if they cannot be performed
 800      immediately, they are queued.  We set the number of simultaneous prefetches
 801      to a large constant to reflect this (leaving the number of prefetches
 802      completely unlimited is probably not a good idea either, as their
 803      execution also takes some time).  */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 813   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 814      very small blocks it is better to use a loop.  For large blocks, a libcall
 815      can do non-temporal accesses and beat inline code considerably.  */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
 834   1,					/* cond_not_taken_branch_cost.  */
 835 };
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
 889   /* New AMD processors never drop prefetches; if they cannot be performed
 890      immediately, they are queued.  We set the number of simultaneous prefetches
 891      to a large constant to reflect this (leaving the number of prefetches
 892      completely unlimited is probably not a good idea either, as their
 893      execution also takes some time).  */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
 903   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
 904      very small blocks it is better to use a loop.  For large blocks, a libcall
 905      can do non-temporal accesses and beat inline code considerably.  */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
 924   1,					/* cond_not_taken_branch_cost.  */
 925 };
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
 979   /* New AMD processors never drop prefetches; if they cannot be performed
 980      immediately, they are queued.  We set the number of simultaneous prefetches
 981      to a large constant to reflect this (leaving the number of prefetches
 982      completely unlimited is probably not a good idea either, as their
 983      execution also takes some time).  */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
 993   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
 994      very small blocks it is better to use a loop.  For large blocks, a libcall
 995      can do non-temporal accesses and beat inline code considerably.  */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
 1014   1,					/* cond_not_taken_branch_cost.  */
 1015 };
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
 1061   /* New AMD processors never drop prefetches; if they cannot be performed
 1062      immediately, they are queued.  We set the number of simultaneous prefetches
 1063      to a large constant to reflect this (leaving the number of prefetches
 1064      completely unlimited is probably not a good idea either, as their
 1065      execution also takes some time).  */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
 1075   /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
 1076      very small blocks it is better to use a loop.  For large blocks, a libcall
 1077      can do non-temporal accesses and beat inline code considerably.  */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
 1096   1,					/* cond_not_taken_branch_cost.  */
 1097 };
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1160   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
 1161      very small blocks it is better to use a loop.  For large blocks, a libcall
 1162      can do non-temporal accesses and beat inline code considerably.  */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
 1181   1,					/* cond_not_taken_branch_cost.  */
 1182 };
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
 1263   1,					/* cond_not_taken_branch_cost.  */
 1264 };
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
 1334   1,					/* cond_not_taken_branch_cost.  */
 1335 };
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 128, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471 1, /* scalar load_cost. */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487 /* On all chips taken into consideration lea takes 2 cycles or more. With
1488 this cost, however, our current implementation of synth_mult results in
1489 the use of unnecessary temporary registers, causing regressions on several
1490 SPECfp benchmarks. */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535 value is increased to the perhaps more appropriate value of 5. */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550 1, /* scalar load_cost. */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1562 /* core_cost should produce code tuned for the Core family of CPUs. */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566 /* On all chips taken into consideration lea takes 2 cycles or more. With
1567 this cost, however, our current implementation of synth_mult results in
1568 the use of unnecessary temporary registers, causing regressions on several
1569 SPECfp benchmarks. */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613 /* FIXME perhaps more appropriate value is 5. */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631 1, /* scalar load_cost. */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705 1, /* scalar load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
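/* Illustrative sketch only (not the backend's actual decision logic): each
   {max, alg, noalign} triple in the stringop tables above means "for block
   sizes up to MAX bytes, expand the string operation with ALG"; a trailing
   max of -1 acts as a catch-all.  Assuming the struct stringop_algs layout
   from i386.h, a scan over such a table might look roughly like this.  */
#if 0
static enum stringop_alg
sketch_pick_stringop_alg (const struct stringop_algs *algs,
			  HOST_WIDE_INT nbytes)
{
  unsigned int i;
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || nbytes <= algs->size[i].max)
      return algs->size[i].alg;	/* First matching size class wins.  */
  return libcall;		/* Fall back to the library call.  */
}
#endif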
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
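/* Illustrative sketch only: roughly how the two pointers above are selected
   and then consulted, assuming the field names of struct processor_costs
   from i386.h and the processor_target_table defined later in this file.
   The real selection also honours attribute(target) contexts.  */
#if 0
static void
sketch_select_cost_tables (void)
{
  ix86_tune_cost = processor_target_table[ix86_tune].cost;
  ix86_cost = optimize_size ? &ix86_size_cost : ix86_tune_cost;

  /* A typical query: the cost of starting an SImode multiply.  */
  int simode_mul_cost = ix86_cost->mult_init[MODE_INDEX (SImode)];
}
#endif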
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_CORE2I7 (m_CORE2 | m_COREI7)
1734 #define m_ATOM (1<<PROCESSOR_ATOM)
1736 #define m_GEODE (1<<PROCESSOR_GEODE)
1737 #define m_K6 (1<<PROCESSOR_K6)
1738 #define m_K6_GEODE (m_K6 | m_GEODE)
1739 #define m_K8 (1<<PROCESSOR_K8)
1740 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1741 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1742 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1743 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1744 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1745 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1746 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1747 #define m_BTVER (m_BTVER1 | m_BTVER2)
1748 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1749 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1750 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1752 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1753 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1755 /* Generic instruction choice should be a common subset of the supported CPUs
1756 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1757 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1759 /* Feature tests against the various tunings. */
1760 unsigned char ix86_tune_features[X86_TUNE_LAST];
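/* Illustrative sketch only: each entry of the table below is a bitmask of
   the m_* processor bits defined above, and it is reduced to a 0/1 flag for
   the processor chosen by -mtune roughly as follows (the real code lives in
   ix86_option_override_internal).  */
#if 0
static void
sketch_init_tune_features (void)
{
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  unsigned int i;

  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
}
#endif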
1762 /* Feature tests against the various tunings used to create ix86_tune_features
1763 based on the processor mask. */
1764 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1765 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1766 negatively, so enabling it for Generic64 seems like a good code-size
1767 tradeoff. We can't enable it for 32-bit generic because it does not
1768 work well with PPro based chips. */
1769 m_386 | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1771 /* X86_TUNE_PUSH_MEMORY */
1772 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1774 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1775 m_486 | m_PENT,
1777 /* X86_TUNE_UNROLL_STRLEN */
1778 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1780 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1781 on simulation result. But after P4 was made, no performance benefit
1782 was observed with branch hints. It also increases the code size.
1783 As a result, icc never generates branch hints. */
1786 /* X86_TUNE_DOUBLE_WITH_ADD */
1787 ~m_386,
1789 /* X86_TUNE_USE_SAHF */
1790 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1792 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1793 partial dependencies. */
1794 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1796 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1797 register stalls on Generic32 compilation setting as well. However
1798 in current implementation the partial register stalls are not eliminated
1799 very well - they can be introduced via subregs synthesized by combine
1800 and can happen in caller/callee saving sequences. Because this option
1801 pays back little on PPro based chips and is in conflict with partial reg
1802 dependencies used by Athlon/P4 based chips, it is better to leave it off
1803 for generic32 for now. */
1804 m_PPRO,
1806 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1807 m_CORE2I7 | m_GENERIC,
1809 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1810 * on 16-bit immediate moves into memory on Core2 and Corei7. */
1811 m_CORE2I7 | m_GENERIC,
1813 /* X86_TUNE_USE_HIMODE_FIOP */
1814 m_386 | m_486 | m_K6_GEODE,
1816 /* X86_TUNE_USE_SIMODE_FIOP */
1817 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1819 /* X86_TUNE_USE_MOV0 */
1820 m_K6,
1822 /* X86_TUNE_USE_CLTD */
1823 ~(m_PENT | m_ATOM | m_K6),
1825 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1826 m_PENT4,
1828 /* X86_TUNE_SPLIT_LONG_MOVES */
1829 m_PPRO,
1831 /* X86_TUNE_READ_MODIFY_WRITE */
1832 ~m_PENT,
1834 /* X86_TUNE_READ_MODIFY */
1835 ~(m_PENT | m_PPRO),
1837 /* X86_TUNE_PROMOTE_QIMODE */
1838 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1840 /* X86_TUNE_FAST_PREFIX */
1841 ~(m_386 | m_486 | m_PENT),
1843 /* X86_TUNE_SINGLE_STRINGOP */
1844 m_386 | m_P4_NOCONA,
1846 /* X86_TUNE_QIMODE_MATH */
1849 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1850 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1851 might be considered for Generic32 if our scheme for avoiding partial
1852 stalls was more effective. */
1853 ~m_PPRO,
1855 /* X86_TUNE_PROMOTE_QI_REGS */
1858 /* X86_TUNE_PROMOTE_HI_REGS */
1859 m_PPRO,
1861 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1862 over esp addition. */
1863 m_386 | m_486 | m_PENT | m_PPRO,
1865 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1866 over esp addition. */
1867 m_PENT,
1869 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1870 over esp subtraction. */
1871 m_386 | m_486 | m_PENT | m_K6_GEODE,
1873 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
1874 over esp subtraction. */
1875 m_PENT | m_K6_GEODE,
1877 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1878 for DFmode copies */
1879 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1881 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1882 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1884 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1885 conflict here between PPro/Pentium4 based chips that treat 128-bit
1886 SSE registers as single units and K8 based chips that split SSE
1887 registers into two 64-bit halves. This knob promotes all store destinations
1888 to 128 bits to allow register renaming on 128-bit SSE units, but usually
1889 results in one extra micro-op on 64-bit SSE units. Experimental results
1890 show that disabling this option on P4 brings over a 20% SPECfp regression,
1891 while enabling it on K8 brings roughly a 2.4% regression that can be partly
1892 masked by careful scheduling of moves. */
1893 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1895 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1896 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1898 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1899 m_COREI7 | m_BDVER,
1901 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1902 m_BDVER ,
1904 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1905 are resolved on SSE register parts instead of whole registers, so we may
1906 maintain just the lower part of scalar values in the proper format, leaving the
1907 upper part undefined. */
1908 m_ATHLON_K8,
1910 /* X86_TUNE_SSE_TYPELESS_STORES */
1911 m_AMD_MULTIPLE,
1913 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1914 m_PPRO | m_P4_NOCONA,
1916 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1917 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1919 /* X86_TUNE_PROLOGUE_USING_MOVE */
1920 m_PPRO | m_ATHLON_K8,
1922 /* X86_TUNE_EPILOGUE_USING_MOVE */
1923 m_PPRO | m_ATHLON_K8,
1925 /* X86_TUNE_SHIFT1 */
1926 ~m_486,
1928 /* X86_TUNE_USE_FFREEP */
1929 m_AMD_MULTIPLE,
1931 /* X86_TUNE_INTER_UNIT_MOVES */
1932 ~(m_AMD_MULTIPLE | m_GENERIC),
1934 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1935 ~(m_AMDFAM10 | m_BDVER ),
1937 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1938 than 4 branch instructions in the 16 byte window. */
1939 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1941 /* X86_TUNE_SCHEDULE */
1942 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1944 /* X86_TUNE_USE_BT */
1945 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1947 /* X86_TUNE_USE_INCDEC */
1948 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
1950 /* X86_TUNE_PAD_RETURNS */
1951 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
1953 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
1954 m_ATOM,
1956 /* X86_TUNE_EXT_80387_CONSTANTS */
1957 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1959 /* X86_TUNE_AVOID_VECTOR_DECODE */
1960 m_CORE2I7 | m_K8 | m_GENERIC64,
1962 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1963 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
1964 ~(m_386 | m_486),
1966 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1967 vector path on AMD machines. */
1968 m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1970 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1971 machines. */
1972 m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1974 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1975 than a MOV. */
1976 m_PENT,
1978 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1979 but one byte longer. */
1980 m_PENT,
1982 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1983 operand that cannot be represented using a modRM byte. The XOR
1984 replacement is long decoded, so this split helps here as well. */
1985 m_K6,
1987 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1988 from FP to FP. */
1989 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
1991 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1992 from integer to FP. */
1993 m_AMDFAM10,
1995 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1996 with a subsequent conditional jump instruction into a single
1997 compare-and-branch uop. */
1998 m_BDVER,
2000 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2001 will impact LEA instruction selection. */
2002 m_ATOM,
2004 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2005 instructions. */
2006 ~m_ATOM,
2008 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2009 at -O3. For the moment, the prefetching seems badly tuned for Intel
2010 chips. */
2011 m_K6_GEODE | m_AMD_MULTIPLE,
2013 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2014 the auto-vectorizer. */
2015 m_BDVER | m_BTVER2,
2017 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2018 during reassociation of integer computation. */
2019 m_ATOM,
2021 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2022 during reassociation of fp computation. */
2023 m_ATOM,
2025 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2026 regs instead of memory. */
2027 m_COREI7 | m_CORE2I7,
2029 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2030 a conditional move. */
2031 m_ATOM
2034 /* Feature tests against the various architecture variations. */
2035 unsigned char ix86_arch_features[X86_ARCH_LAST];
2037 /* Feature tests against the various architecture variations, used to create
2038 ix86_arch_features based on the processor mask. */
2039 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2040 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2041 ~(m_386 | m_486 | m_PENT | m_K6),
2043 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2044 ~m_386,
2046 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2047 ~(m_386 | m_486),
2049 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2050 ~m_386,
2052 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2053 ~m_386,
2056 static const unsigned int x86_accumulate_outgoing_args
2057 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2059 static const unsigned int x86_arch_always_fancy_math_387
2060 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2062 static const unsigned int x86_avx256_split_unaligned_load
2063 = m_COREI7 | m_GENERIC;
2065 static const unsigned int x86_avx256_split_unaligned_store
2066 = m_COREI7 | m_BDVER | m_GENERIC;
2068 /* In case the average insn count for a single function invocation is
2069 lower than this constant, emit fast (but longer) prologue and
2070 epilogue code. */
2071 #define FAST_PROLOGUE_INSN_COUNT 20
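/* Illustrative sketch only: the constant above is compared against an
   estimated dynamic insn count when deciding how to emit the prologue.
   estimated_insn_count and frame are assumed locals here; the real
   heuristic in ix86_compute_frame_layout also weighs basic-block
   frequencies and whether the function is a leaf.  */
#if 0
  if (estimated_insn_count < FAST_PROLOGUE_INSN_COUNT)
    /* Prefer the longer but faster mov-based save sequence.  */
    frame->save_regs_using_mov = true;
#endif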
2073 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2074 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2075 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2076 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2078 /* Array of the smallest class containing reg number REGNO, indexed by
2079 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2081 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2083 /* ax, dx, cx, bx */
2084 AREG, DREG, CREG, BREG,
2085 /* si, di, bp, sp */
2086 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2087 /* FP registers */
2088 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2089 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2090 /* arg pointer */
2091 NON_Q_REGS,
2092 /* flags, fpsr, fpcr, frame */
2093 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2094 /* SSE registers */
2095 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2096 SSE_REGS, SSE_REGS,
2097 /* MMX registers */
2098 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2099 MMX_REGS, MMX_REGS,
2100 /* REX registers */
2101 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 /* SSE REX registers */
2104 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2105 SSE_REGS, SSE_REGS,
2108 /* The "default" register map used in 32bit mode. */
2110 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2112 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2113 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2114 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2115 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2116 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2117 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2121 /* The "default" register map used in 64bit mode. */
2123 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2125 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2126 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2127 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2128 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2129 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2130 8,9,10,11,12,13,14,15, /* extended integer registers */
2131 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2134 /* Define the register numbers to be used in Dwarf debugging information.
2135 The SVR4 reference port C compiler uses the following register numbers
2136 in its Dwarf output code:
2137 0 for %eax (gcc regno = 0)
2138 1 for %ecx (gcc regno = 2)
2139 2 for %edx (gcc regno = 1)
2140 3 for %ebx (gcc regno = 3)
2141 4 for %esp (gcc regno = 7)
2142 5 for %ebp (gcc regno = 6)
2143 6 for %esi (gcc regno = 4)
2144 7 for %edi (gcc regno = 5)
2145 The following three DWARF register numbers are never generated by
2146 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2147 believes these numbers have these meanings.
2148 8 for %eip (no gcc equivalent)
2149 9 for %eflags (gcc regno = 17)
2150 10 for %trapno (no gcc equivalent)
2151 It is not at all clear how we should number the FP stack registers
2152 for the x86 architecture. If the version of SDB on x86/svr4 were
2153 a bit less brain dead with respect to floating-point then we would
2154 have a precedent to follow with respect to DWARF register numbers
2155 for x86 FP registers, but the SDB on x86/svr4 is so completely
2156 broken with respect to FP registers that it is hardly worth thinking
2157 of it as something to strive for compatibility with.
2158 The version of x86/svr4 SDB I have at the moment does (partially)
2159 seem to believe that DWARF register number 11 is associated with
2160 the x86 register %st(0), but that's about all. Higher DWARF
2161 register numbers don't seem to be associated with anything in
2162 particular, and even for DWARF regno 11, SDB only seems to under-
2163 stand that it should say that a variable lives in %st(0) (when
2164 asked via an `=' command) if we said it was in DWARF regno 11,
2165 but SDB still prints garbage when asked for the value of the
2166 variable in question (via a `/' command).
2167 (Also note that the labels SDB prints for various FP stack regs
2168 when doing an `x' command are all wrong.)
2169 Note that these problems generally don't affect the native SVR4
2170 C compiler because it doesn't allow the use of -O with -g and
2171 because when it is *not* optimizing, it allocates a memory
2172 location for each floating-point variable, and the memory
2173 location is what gets described in the DWARF AT_location
2174 attribute for the variable in question.
2175 Regardless of the severe mental illness of the x86/svr4 SDB, we
2176 do something sensible here and we use the following DWARF
2177 register numbers. Note that these are all stack-top-relative
2178 numbers.
2179 11 for %st(0) (gcc regno = 8)
2180 12 for %st(1) (gcc regno = 9)
2181 13 for %st(2) (gcc regno = 10)
2182 14 for %st(3) (gcc regno = 11)
2183 15 for %st(4) (gcc regno = 12)
2184 16 for %st(5) (gcc regno = 13)
2185 17 for %st(6) (gcc regno = 14)
2186 18 for %st(7) (gcc regno = 15)
2188 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2190 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2191 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2192 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2193 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2194 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2195 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
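/* For illustration, tying the table rows back to the comment above:
   gcc regno 6 is %ebp and svr4_dbx_register_map[6] == 5, the SVR4 DWARF
   number for %ebp; likewise gcc regno 8 (%st(0)) maps to DWARF regno 11.  */
#if 0
  gcc_assert (svr4_dbx_register_map[6] == 5);	/* %ebp */
  gcc_assert (svr4_dbx_register_map[8] == 11);	/* %st(0) */
#endif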
2199 /* Define parameter passing and return registers. */
2201 static int const x86_64_int_parameter_registers[6] =
2203 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2206 static int const x86_64_ms_abi_int_parameter_registers[4] =
2208 CX_REG, DX_REG, R8_REG, R9_REG
2211 static int const x86_64_int_return_registers[4] =
2213 AX_REG, DX_REG, DI_REG, SI_REG
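/* For illustration: for a call f (a, b, c), the SysV table above passes
   a in %rdi, b in %rsi and c in %rdx, while the MS ABI table passes them
   in %rcx, %rdx and %r8; integer return values come back in %rax, with
   %rdx holding the second word when needed.  */
#if 0
  gcc_assert (x86_64_int_parameter_registers[0] == DI_REG);	    /* 1st SysV arg */
  gcc_assert (x86_64_ms_abi_int_parameter_registers[0] == CX_REG);  /* 1st MS ABI arg */
#endif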
2216 /* Define the structure for the machine field in struct function. */
2218 struct GTY(()) stack_local_entry {
2219 unsigned short mode;
2220 unsigned short n;
2221 rtx rtl;
2222 struct stack_local_entry *next;
2225 /* Structure describing stack frame layout.
2226 Stack grows downward:
2228 [arguments]
2229 <- ARG_POINTER
2230 saved pc
2232 saved static chain if ix86_static_chain_on_stack
2234 saved frame pointer if frame_pointer_needed
2235 <- HARD_FRAME_POINTER
2236 [saved regs]
2237 <- regs_save_offset
2238 [padding0]
2240 [saved SSE regs]
2241 <- sse_regs_save_offset
2242 [padding1] |
2243 | <- FRAME_POINTER
2244 [va_arg registers] |
2246 [frame] |
2248 [padding2] | = to_allocate
2249 <- STACK_POINTER
2251 struct ix86_frame
2253 int nsseregs;
2254 int nregs;
2255 int va_arg_size;
2256 int red_zone_size;
2257 int outgoing_arguments_size;
2259 /* The offsets relative to ARG_POINTER. */
2260 HOST_WIDE_INT frame_pointer_offset;
2261 HOST_WIDE_INT hard_frame_pointer_offset;
2262 HOST_WIDE_INT stack_pointer_offset;
2263 HOST_WIDE_INT hfp_save_offset;
2264 HOST_WIDE_INT reg_save_offset;
2265 HOST_WIDE_INT sse_reg_save_offset;
2267 /* When save_regs_using_mov is set, emit prologue using
2268 move instead of push instructions. */
2269 bool save_regs_using_mov;
2272 /* Which cpu are we scheduling for. */
2273 enum attr_cpu ix86_schedule;
2275 /* Which cpu are we optimizing for. */
2276 enum processor_type ix86_tune;
2278 /* Which instruction set architecture to use. */
2279 enum processor_type ix86_arch;
2281 /* True if processor has SSE prefetch instruction. */
2282 unsigned char x86_prefetch_sse;
2284 /* -mstackrealign option */
2285 static const char ix86_force_align_arg_pointer_string[]
2286 = "force_align_arg_pointer";
2288 static rtx (*ix86_gen_leave) (void);
2289 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2290 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2292 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2293 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2294 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2296 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2297 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2301 /* Preferred alignment for stack boundary in bits. */
2302 unsigned int ix86_preferred_stack_boundary;
2304 /* Alignment for incoming stack boundary in bits specified at
2305 command line. */
2306 static unsigned int ix86_user_incoming_stack_boundary;
2308 /* Default alignment for incoming stack boundary in bits. */
2309 static unsigned int ix86_default_incoming_stack_boundary;
2311 /* Alignment for incoming stack boundary in bits. */
2312 unsigned int ix86_incoming_stack_boundary;
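/* Illustrative note: these boundaries are expressed in bits, so a value of
   128 means the stack is kept 16-byte aligned, which is what aligned SSE
   spills require.  A sketch of the conversion used when alignment in bytes
   is needed:  */
#if 0
  unsigned int align_bytes = ix86_preferred_stack_boundary / BITS_PER_UNIT;
#endif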
2314 /* Calling abi specific va_list type nodes. */
2315 static GTY(()) tree sysv_va_list_type_node;
2316 static GTY(()) tree ms_va_list_type_node;
2318 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2319 char internal_label_prefix[16];
2320 int internal_label_prefix_len;
2322 /* Fence to use after loop using movnt. */
2323 tree x86_mfence;
2325 /* Register class used for passing a given 64-bit part of the argument.
2326 These represent the classes documented by the psABI, with the exception
2327 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2328 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2330 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2331 whenever possible (the upper half does contain padding). */
2332 enum x86_64_reg_class
2334 X86_64_NO_CLASS,
2335 X86_64_INTEGER_CLASS,
2336 X86_64_INTEGERSI_CLASS,
2337 X86_64_SSE_CLASS,
2338 X86_64_SSESF_CLASS,
2339 X86_64_SSEDF_CLASS,
2340 X86_64_SSEUP_CLASS,
2341 X86_64_X87_CLASS,
2342 X86_64_X87UP_CLASS,
2343 X86_64_COMPLEX_X87_CLASS,
2344 X86_64_MEMORY_CLASS
2347 #define MAX_CLASSES 4
2349 /* Table of constants used by fldpi, fldln2, etc.... */
2350 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2351 static bool ext_80387_constants_init = 0;
2354 static struct machine_function * ix86_init_machine_status (void);
2355 static rtx ix86_function_value (const_tree, const_tree, bool);
2356 static bool ix86_function_value_regno_p (const unsigned int);
2357 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2358 const_tree);
2359 static rtx ix86_static_chain (const_tree, bool);
2360 static int ix86_function_regparm (const_tree, const_tree);
2361 static void ix86_compute_frame_layout (struct ix86_frame *);
2362 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2363 rtx, rtx, int);
2364 static void ix86_add_new_builtins (HOST_WIDE_INT);
2365 static tree ix86_canonical_va_list_type (tree);
2366 static void predict_jump (int);
2367 static unsigned int split_stack_prologue_scratch_regno (void);
2368 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2370 enum ix86_function_specific_strings
2372 IX86_FUNCTION_SPECIFIC_ARCH,
2373 IX86_FUNCTION_SPECIFIC_TUNE,
2374 IX86_FUNCTION_SPECIFIC_MAX
2377 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2378 const char *, enum fpmath_unit, bool);
2379 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2380 static void ix86_function_specific_save (struct cl_target_option *);
2381 static void ix86_function_specific_restore (struct cl_target_option *);
2382 static void ix86_function_specific_print (FILE *, int,
2383 struct cl_target_option *);
2384 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2385 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2386 struct gcc_options *);
2387 static bool ix86_can_inline_p (tree, tree);
2388 static void ix86_set_current_function (tree);
2389 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2391 static enum calling_abi ix86_function_abi (const_tree);
2394 #ifndef SUBTARGET32_DEFAULT_CPU
2395 #define SUBTARGET32_DEFAULT_CPU "i386"
2396 #endif
2398 /* Whether -mtune= or -march= were specified */
2399 static int ix86_tune_defaulted;
2400 static int ix86_arch_specified;
2402 /* Vectorization library interface and handlers. */
2403 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2405 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2408 /* Processor target table, indexed by processor number */
2409 struct ptt
2411 const struct processor_costs *cost; /* Processor costs */
2412 const int align_loop; /* Default alignments. */
2413 const int align_loop_max_skip;
2414 const int align_jump;
2415 const int align_jump_max_skip;
2416 const int align_func;
2419 static const struct ptt processor_target_table[PROCESSOR_max] =
2421 {&i386_cost, 4, 3, 4, 3, 4},
2422 {&i486_cost, 16, 15, 16, 15, 16},
2423 {&pentium_cost, 16, 7, 16, 7, 16},
2424 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2425 {&geode_cost, 0, 0, 0, 0, 0},
2426 {&k6_cost, 32, 7, 32, 7, 32},
2427 {&athlon_cost, 16, 7, 16, 7, 16},
2428 {&pentium4_cost, 0, 0, 0, 0, 0},
2429 {&k8_cost, 16, 7, 16, 7, 16},
2430 {&nocona_cost, 0, 0, 0, 0, 0},
2431 /* Core 2 */
2432 {&core_cost, 16, 10, 16, 10, 16},
2433 /* Core i7 */
2434 {&core_cost, 16, 10, 16, 10, 16},
2435 {&generic32_cost, 16, 7, 16, 7, 16},
2436 {&generic64_cost, 16, 10, 16, 10, 16},
2437 {&amdfam10_cost, 32, 24, 32, 7, 32},
2438 {&bdver1_cost, 32, 24, 32, 7, 32},
2439 {&bdver2_cost, 32, 24, 32, 7, 32},
2440 {&bdver3_cost, 32, 24, 32, 7, 32},
2441 {&btver1_cost, 32, 24, 32, 7, 32},
2442 {&btver2_cost, 32, 24, 32, 7, 32},
2443 {&atom_cost, 16, 15, 16, 7, 16}
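/* Illustrative sketch only: the align_* columns above act as per-processor
   defaults applied when the user did not pass explicit -falign-* options,
   roughly like this (assuming the usual align_loops/align_functions option
   variables; the real code is in ix86_option_override_internal).  */
#if 0
  if (align_loops == 0)
    {
      align_loops = processor_target_table[ix86_tune].align_loop;
      align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
    }
  if (align_functions == 0)
    align_functions = processor_target_table[ix86_tune].align_func;
#endif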
2446 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2448 "generic",
2449 "i386",
2450 "i486",
2451 "pentium",
2452 "pentium-mmx",
2453 "pentiumpro",
2454 "pentium2",
2455 "pentium3",
2456 "pentium4",
2457 "pentium-m",
2458 "prescott",
2459 "nocona",
2460 "core2",
2461 "corei7",
2462 "atom",
2463 "geode",
2464 "k6",
2465 "k6-2",
2466 "k6-3",
2467 "athlon",
2468 "athlon-4",
2469 "k8",
2470 "amdfam10",
2471 "bdver1",
2472 "bdver2",
2473 "bdver3",
2474 "btver1",
2475 "btver2"
2478 static bool
2479 gate_insert_vzeroupper (void)
2481 return TARGET_VZEROUPPER;
2484 static unsigned int
2485 rest_of_handle_insert_vzeroupper (void)
2487 int i;
2489 /* vzeroupper instructions are inserted immediately after reload to
2490 account for possible spills from 256-bit registers. The pass
2491 reuses the mode switching infrastructure by re-running the mode insertion
2492 pass, so disable entities that have already been processed. */
2493 for (i = 0; i < MAX_386_ENTITIES; i++)
2494 ix86_optimize_mode_switching[i] = 0;
2496 ix86_optimize_mode_switching[AVX_U128] = 1;
2498 /* Call optimize_mode_switching. */
2499 pass_mode_switching.pass.execute ();
2500 return 0;
2503 struct rtl_opt_pass pass_insert_vzeroupper =
2506 RTL_PASS,
2507 "vzeroupper", /* name */
2508 OPTGROUP_NONE, /* optinfo_flags */
2509 gate_insert_vzeroupper, /* gate */
2510 rest_of_handle_insert_vzeroupper, /* execute */
2511 NULL, /* sub */
2512 NULL, /* next */
2513 0, /* static_pass_number */
2514 TV_NONE, /* tv_id */
2515 0, /* properties_required */
2516 0, /* properties_provided */
2517 0, /* properties_destroyed */
2518 0, /* todo_flags_start */
2519 TODO_df_finish | TODO_verify_rtl_sharing |
2520 0, /* todo_flags_finish */
2524 /* Return true if a red-zone is in use. */
2526 static inline bool
2527 ix86_using_red_zone (void)
2529 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
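/* Illustrative note: the "red zone" is the 128-byte area below %rsp that the
   SysV x86-64 ABI guarantees is not clobbered by signal or interrupt
   handlers, so leaf functions may use it without adjusting the stack
   pointer; the MS ABI provides no such area, hence the check above.
   frame_size below is an assumed local, for illustration only.  */
#if 0
  bool may_skip_rsp_adjustment = ix86_using_red_zone () && frame_size <= 128;
#endif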
2532 /* Return a string that documents the current -m options. The caller is
2533 responsible for freeing the string. */
2535 static char *
2536 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2537 const char *tune, enum fpmath_unit fpmath,
2538 bool add_nl_p)
2540 struct ix86_target_opts
2542 const char *option; /* option string */
2543 HOST_WIDE_INT mask; /* isa mask options */
2546 /* This table is ordered so that options like -msse4.2 that imply
2547 other options are listed, and therefore matched, first. */
2548 static struct ix86_target_opts isa_opts[] =
2550 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2551 { "-mfma", OPTION_MASK_ISA_FMA },
2552 { "-mxop", OPTION_MASK_ISA_XOP },
2553 { "-mlwp", OPTION_MASK_ISA_LWP },
2554 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2555 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2556 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2557 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2558 { "-msse3", OPTION_MASK_ISA_SSE3 },
2559 { "-msse2", OPTION_MASK_ISA_SSE2 },
2560 { "-msse", OPTION_MASK_ISA_SSE },
2561 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2562 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2563 { "-mmmx", OPTION_MASK_ISA_MMX },
2564 { "-mabm", OPTION_MASK_ISA_ABM },
2565 { "-mbmi", OPTION_MASK_ISA_BMI },
2566 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2567 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2568 { "-mhle", OPTION_MASK_ISA_HLE },
2569 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2570 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2571 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2572 { "-madx", OPTION_MASK_ISA_ADX },
2573 { "-mtbm", OPTION_MASK_ISA_TBM },
2574 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2575 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2576 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2577 { "-maes", OPTION_MASK_ISA_AES },
2578 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2579 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2580 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2581 { "-mf16c", OPTION_MASK_ISA_F16C },
2582 { "-mrtm", OPTION_MASK_ISA_RTM },
2583 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2584 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2587 /* Flag options. */
2588 static struct ix86_target_opts flag_opts[] =
2590 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2591 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2592 { "-m80387", MASK_80387 },
2593 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2594 { "-malign-double", MASK_ALIGN_DOUBLE },
2595 { "-mcld", MASK_CLD },
2596 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2597 { "-mieee-fp", MASK_IEEE_FP },
2598 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2599 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2600 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2601 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2602 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2603 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2604 { "-mno-red-zone", MASK_NO_RED_ZONE },
2605 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2606 { "-mrecip", MASK_RECIP },
2607 { "-mrtd", MASK_RTD },
2608 { "-msseregparm", MASK_SSEREGPARM },
2609 { "-mstack-arg-probe", MASK_STACK_PROBE },
2610 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2611 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2612 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2613 { "-mvzeroupper", MASK_VZEROUPPER },
2614 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2615 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2616 { "-mprefer-avx128", MASK_PREFER_AVX128},
2619 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2621 char isa_other[40];
2622 char target_other[40];
2623 unsigned num = 0;
2624 unsigned i, j;
2625 char *ret;
2626 char *ptr;
2627 size_t len;
2628 size_t line_len;
2629 size_t sep_len;
2630 const char *abi;
2632 memset (opts, '\0', sizeof (opts));
2634 /* Add -march= option. */
2635 if (arch)
2637 opts[num][0] = "-march=";
2638 opts[num++][1] = arch;
2641 /* Add -mtune= option. */
2642 if (tune)
2644 opts[num][0] = "-mtune=";
2645 opts[num++][1] = tune;
2648 /* Add -m32/-m64/-mx32. */
2649 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2651 if ((isa & OPTION_MASK_ABI_64) != 0)
2652 abi = "-m64";
2653 else
2654 abi = "-mx32";
2655 isa &= ~ (OPTION_MASK_ISA_64BIT
2656 | OPTION_MASK_ABI_64
2657 | OPTION_MASK_ABI_X32);
2659 else
2660 abi = "-m32";
2661 opts[num++][0] = abi;
2663 /* Pick out the options in isa options. */
2664 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2666 if ((isa & isa_opts[i].mask) != 0)
2668 opts[num++][0] = isa_opts[i].option;
2669 isa &= ~ isa_opts[i].mask;
2673 if (isa && add_nl_p)
2675 opts[num++][0] = isa_other;
2676 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2677 isa);
2680 /* Add flag options. */
2681 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2683 if ((flags & flag_opts[i].mask) != 0)
2685 opts[num++][0] = flag_opts[i].option;
2686 flags &= ~ flag_opts[i].mask;
2690 if (flags && add_nl_p)
2692 opts[num++][0] = target_other;
2693 sprintf (target_other, "(other flags: %#x)", flags);
2696 /* Add -fpmath= option. */
2697 if (fpmath)
2699 opts[num][0] = "-mfpmath=";
2700 switch ((int) fpmath)
2702 case FPMATH_387:
2703 opts[num++][1] = "387";
2704 break;
2706 case FPMATH_SSE:
2707 opts[num++][1] = "sse";
2708 break;
2710 case FPMATH_387 | FPMATH_SSE:
2711 opts[num++][1] = "sse+387";
2712 break;
2714 default:
2715 gcc_unreachable ();
2719 /* Any options? */
2720 if (num == 0)
2721 return NULL;
2723 gcc_assert (num < ARRAY_SIZE (opts));
2725 /* Size the string. */
2726 len = 0;
2727 sep_len = (add_nl_p) ? 3 : 1;
2728 for (i = 0; i < num; i++)
2730 len += sep_len;
2731 for (j = 0; j < 2; j++)
2732 if (opts[i][j])
2733 len += strlen (opts[i][j]);
2736 /* Build the string. */
2737 ret = ptr = (char *) xmalloc (len);
2738 line_len = 0;
2740 for (i = 0; i < num; i++)
2742 size_t len2[2];
2744 for (j = 0; j < 2; j++)
2745 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2747 if (i != 0)
2749 *ptr++ = ' ';
2750 line_len++;
2752 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2754 *ptr++ = '\\';
2755 *ptr++ = '\n';
2756 line_len = 0;
2760 for (j = 0; j < 2; j++)
2761 if (opts[i][j])
2763 memcpy (ptr, opts[i][j], len2[j]);
2764 ptr += len2[j];
2765 line_len += len2[j];
2769 *ptr = '\0';
2770 gcc_assert (ret + len >= ptr);
2772 return ret;
2775 /* Return true if profiling code should be emitted before the
2776 prologue, otherwise false. On x86 this is the case when "hotfix"
2777 style profiling (-mfentry) is enabled. */
2778 static bool
2779 ix86_profile_before_prologue (void)
2781 return flag_fentry != 0;
2784 /* Function that is callable from the debugger to print the current
2785 options. */
2786 void
2787 ix86_debug_options (void)
2789 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2790 ix86_arch_string, ix86_tune_string,
2791 ix86_fpmath, true);
2793 if (opts)
2795 fprintf (stderr, "%s\n\n", opts);
2796 free (opts);
2798 else
2799 fputs ("<no options>\n\n", stderr);
2801 return;
2804 /* Override various settings based on options. If MAIN_ARGS_P, the
2805 options are from the command line, otherwise they are from
2806 attributes. */
2808 static void
2809 ix86_option_override_internal (bool main_args_p)
2811 int i;
2812 unsigned int ix86_arch_mask, ix86_tune_mask;
2813 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2814 const char *prefix;
2815 const char *suffix;
2816 const char *sw;
2818 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2819 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2820 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2821 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2822 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2823 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2824 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2825 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2826 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2827 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2828 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2829 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2830 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2831 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2832 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2833 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2834 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2835 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2836 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2837 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2838 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2839 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2840 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2841 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2842 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2843 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2844 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2845 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2846 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2847 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2848 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2849 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2850 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2851 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2852 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2853 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2854 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2855 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2856 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2857 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2859 /* If this reaches 64, the flags field of struct pta below needs to be widened. */
2861 static struct pta
2863 const char *const name; /* processor name or nickname. */
2864 const enum processor_type processor;
2865 const enum attr_cpu schedule;
2866 const unsigned HOST_WIDE_INT flags;
2868 const processor_alias_table[] =
2870 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2871 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2872 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2873 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2874 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2875 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2876 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2877 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2878 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2879 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2880 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2881 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2882 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2883 PTA_MMX | PTA_SSE | PTA_FXSR},
2884 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2885 PTA_MMX | PTA_SSE | PTA_FXSR},
2886 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2887 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2888 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2889 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2890 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2891 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2892 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2893 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2894 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2895 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2896 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2897 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2898 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2899 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2900 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2901 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2902 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2903 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2904 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2905 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2906 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2907 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2908 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2909 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2910 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2911 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2912 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2913 {"core-avx2", PROCESSOR_COREI7, CPU_COREI7,
2914 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2915 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2916 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2917 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2918 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2919 | PTA_XSAVEOPT},
2920 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2921 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2922 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2923 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2924 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2925 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2926 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2927 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2928 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2929 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2930 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2931 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2932 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2933 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2934 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2935 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2936 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2937 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2938 {"x86-64", PROCESSOR_K8, CPU_K8,
2939 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2940 {"k8", PROCESSOR_K8, CPU_K8,
2941 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2942 | PTA_SSE2 | PTA_NO_SAHF},
2943 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2944 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2945 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2946 {"opteron", PROCESSOR_K8, CPU_K8,
2947 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2948 | PTA_SSE2 | PTA_NO_SAHF},
2949 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2950 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2951 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2952 {"athlon64", PROCESSOR_K8, CPU_K8,
2953 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2954 | PTA_SSE2 | PTA_NO_SAHF},
2955 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2956 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2957 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2958 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2959 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2960 | PTA_SSE2 | PTA_NO_SAHF},
2961 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2962 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2963 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2964 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2965 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2966 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2967 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2968 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2969 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2970 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2971 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2972 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2973 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2974 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2975 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2976 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2977 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2978 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2979 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2980 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2981 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2982 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2983 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2984 | PTA_XSAVEOPT},
2985 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2986 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2987         | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2988 | PTA_FXSR | PTA_XSAVE},
2989 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
2990 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2991         | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2992 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2993 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2994 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2996 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
2997 PTA_HLE /* flags are only used for -march switch. */ },
2998 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
2999 PTA_64BIT
3000 | PTA_HLE /* flags are only used for -march switch. */ },
3003 /* -mrecip options. */
3004 static struct
3006 const char *string; /* option name */
3007 unsigned int mask; /* mask bits to set */
3009 const recip_options[] =
3011 { "all", RECIP_MASK_ALL },
3012 { "none", RECIP_MASK_NONE },
3013 { "div", RECIP_MASK_DIV },
3014 { "sqrt", RECIP_MASK_SQRT },
3015 { "vec-div", RECIP_MASK_VEC_DIV },
3016 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3019 int const pta_size = ARRAY_SIZE (processor_alias_table);
3021 /* Set up prefix/suffix so the error messages refer to either the command
3022 line argument, or the attribute(target). */
3023 if (main_args_p)
3025 prefix = "-m";
3026 suffix = "";
3027 sw = "switch";
3029 else
3031 prefix = "option(\"";
3032 suffix = "\")";
3033 sw = "attribute";
3036 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3037 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3038 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3039 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3040 #ifdef TARGET_BI_ARCH
3041 else
3043 #if TARGET_BI_ARCH == 1
3044 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3045 is on and OPTION_MASK_ABI_X32 is off. We turn off
3046 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3047 -mx32. */
3048 if (TARGET_X32)
3049 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3050 #else
3051 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3052 on and OPTION_MASK_ABI_64 is off. We turn off
3053 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3054 -m64. */
3055 if (TARGET_LP64)
3056 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3057 #endif
3059 #endif
3061 if (TARGET_X32)
3063 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3064 OPTION_MASK_ABI_64 for TARGET_X32. */
3065 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3066 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3068 else if (TARGET_LP64)
3070 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3071 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3072 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3073 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3076 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3077 SUBTARGET_OVERRIDE_OPTIONS;
3078 #endif
3080 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3081 SUBSUBTARGET_OVERRIDE_OPTIONS;
3082 #endif
3084 /* -fPIC is the default for x86_64. */
3085 if (TARGET_MACHO && TARGET_64BIT)
3086 flag_pic = 2;
3088 /* Need to check -mtune=generic first. */
3089 if (ix86_tune_string)
3091 if (!strcmp (ix86_tune_string, "generic")
3092 || !strcmp (ix86_tune_string, "i686")
3093 /* As special support for cross compilers we read -mtune=native
3094 as -mtune=generic. With native compilers we won't see the
3095 -mtune=native, as it was changed by the driver. */
3096 || !strcmp (ix86_tune_string, "native"))
3098 if (TARGET_64BIT)
3099 ix86_tune_string = "generic64";
3100 else
3101 ix86_tune_string = "generic32";
3103 /* If this call is for setting the option attribute, allow the
3104 generic32/generic64 that was previously set. */
3105 else if (!main_args_p
3106 && (!strcmp (ix86_tune_string, "generic32")
3107 || !strcmp (ix86_tune_string, "generic64")))
3109 else if (!strncmp (ix86_tune_string, "generic", 7))
3110 error ("bad value (%s) for %stune=%s %s",
3111 ix86_tune_string, prefix, suffix, sw);
3112 else if (!strcmp (ix86_tune_string, "x86-64"))
3113 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3114 "%stune=k8%s or %stune=generic%s instead as appropriate",
3115 prefix, suffix, prefix, suffix, prefix, suffix);
3117 else
3119 if (ix86_arch_string)
3120 ix86_tune_string = ix86_arch_string;
3121 if (!ix86_tune_string)
3123 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3124 ix86_tune_defaulted = 1;
3127 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3128 need to use a sensible tune option. */
3129 if (!strcmp (ix86_tune_string, "generic")
3130 || !strcmp (ix86_tune_string, "x86-64")
3131 || !strcmp (ix86_tune_string, "i686"))
3133 if (TARGET_64BIT)
3134 ix86_tune_string = "generic64";
3135 else
3136 ix86_tune_string = "generic32";
3140 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3142 /* rep; movq isn't available in 32-bit code. */
3143 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3144 ix86_stringop_alg = no_stringop;
3147 if (!ix86_arch_string)
3148 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3149 else
3150 ix86_arch_specified = 1;
3152 if (global_options_set.x_ix86_pmode)
3154 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3155 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3156 error ("address mode %qs not supported in the %s bit mode",
3157 TARGET_64BIT ? "short" : "long",
3158 TARGET_64BIT ? "64" : "32");
3160 else
3161 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3163 if (!global_options_set.x_ix86_abi)
3164 ix86_abi = DEFAULT_ABI;
3166 if (global_options_set.x_ix86_cmodel)
3168 switch (ix86_cmodel)
3170 case CM_SMALL:
3171 case CM_SMALL_PIC:
3172 if (flag_pic)
3173 ix86_cmodel = CM_SMALL_PIC;
3174 if (!TARGET_64BIT)
3175 error ("code model %qs not supported in the %s bit mode",
3176 "small", "32");
3177 break;
3179 case CM_MEDIUM:
3180 case CM_MEDIUM_PIC:
3181 if (flag_pic)
3182 ix86_cmodel = CM_MEDIUM_PIC;
3183 if (!TARGET_64BIT)
3184 error ("code model %qs not supported in the %s bit mode",
3185 "medium", "32");
3186 else if (TARGET_X32)
3187 error ("code model %qs not supported in x32 mode",
3188 "medium");
3189 break;
3191 case CM_LARGE:
3192 case CM_LARGE_PIC:
3193 if (flag_pic)
3194 ix86_cmodel = CM_LARGE_PIC;
3195 if (!TARGET_64BIT)
3196 error ("code model %qs not supported in the %s bit mode",
3197 "large", "32");
3198 else if (TARGET_X32)
3199 error ("code model %qs not supported in x32 mode",
3200 "large");
3201 break;
3203 case CM_32:
3204 if (flag_pic)
3205 error ("code model %s does not support PIC mode", "32");
3206 if (TARGET_64BIT)
3207 error ("code model %qs not supported in the %s bit mode",
3208 "32", "64");
3209 break;
3211 case CM_KERNEL:
3212 if (flag_pic)
3214 error ("code model %s does not support PIC mode", "kernel");
3215 ix86_cmodel = CM_32;
3217 if (!TARGET_64BIT)
3218 error ("code model %qs not supported in the %s bit mode",
3219 "kernel", "32");
3220 break;
3222 default:
3223 gcc_unreachable ();
3226 else
3228 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3229 use of rip-relative addressing. This eliminates fixups that
3230 would otherwise be needed if this object is to be placed in a
3231 DLL, and is essentially just as efficient as direct addressing. */
3232 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3233 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3234 else if (TARGET_64BIT)
3235 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3236 else
3237 ix86_cmodel = CM_32;
3239 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3241 error ("-masm=intel not supported in this configuration");
3242 ix86_asm_dialect = ASM_ATT;
3244 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3245 sorry ("%i-bit mode not compiled in",
3246 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3248 for (i = 0; i < pta_size; i++)
3249 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3251 ix86_schedule = processor_alias_table[i].schedule;
3252 ix86_arch = processor_alias_table[i].processor;
3253 /* Default cpu tuning to the architecture. */
3254 ix86_tune = ix86_arch;
3256 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3257 error ("CPU you selected does not support x86-64 "
3258 "instruction set");
3260 if (processor_alias_table[i].flags & PTA_MMX
3261 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3262 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3263 if (processor_alias_table[i].flags & PTA_3DNOW
3264 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3265 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3266 if (processor_alias_table[i].flags & PTA_3DNOW_A
3267 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3268 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3269 if (processor_alias_table[i].flags & PTA_SSE
3270 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3271 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3272 if (processor_alias_table[i].flags & PTA_SSE2
3273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3274 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3275 if (processor_alias_table[i].flags & PTA_SSE3
3276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3277 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3278 if (processor_alias_table[i].flags & PTA_SSSE3
3279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3280 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3281 if (processor_alias_table[i].flags & PTA_SSE4_1
3282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3283 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3284 if (processor_alias_table[i].flags & PTA_SSE4_2
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3286 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3287 if (processor_alias_table[i].flags & PTA_AVX
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3289 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3290 if (processor_alias_table[i].flags & PTA_AVX2
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3292 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3293 if (processor_alias_table[i].flags & PTA_FMA
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3295 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3296 if (processor_alias_table[i].flags & PTA_SSE4A
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3298 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3299 if (processor_alias_table[i].flags & PTA_FMA4
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3301 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3302 if (processor_alias_table[i].flags & PTA_XOP
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3304 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3305 if (processor_alias_table[i].flags & PTA_LWP
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3307 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3308 if (processor_alias_table[i].flags & PTA_ABM
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3310 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3311 if (processor_alias_table[i].flags & PTA_BMI
3312 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3313 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3314 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3315 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3316 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3317 if (processor_alias_table[i].flags & PTA_TBM
3318 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3319 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3320 if (processor_alias_table[i].flags & PTA_BMI2
3321 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3322 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3323 if (processor_alias_table[i].flags & PTA_CX16
3324 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3325 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3326 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3327 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3328 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3329 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3330 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3331 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3332 if (processor_alias_table[i].flags & PTA_MOVBE
3333 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3334 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3335 if (processor_alias_table[i].flags & PTA_AES
3336 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3337 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3338 if (processor_alias_table[i].flags & PTA_PCLMUL
3339 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3340 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3341 if (processor_alias_table[i].flags & PTA_FSGSBASE
3342 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3343 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3344 if (processor_alias_table[i].flags & PTA_RDRND
3345 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3346 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3347 if (processor_alias_table[i].flags & PTA_F16C
3348 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3349 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3350 if (processor_alias_table[i].flags & PTA_RTM
3351 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3352 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3353 if (processor_alias_table[i].flags & PTA_HLE
3354 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3355 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3356 if (processor_alias_table[i].flags & PTA_PRFCHW
3357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3358 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3359 if (processor_alias_table[i].flags & PTA_RDSEED
3360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3361 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3362 if (processor_alias_table[i].flags & PTA_ADX
3363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3364 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3365 if (processor_alias_table[i].flags & PTA_FXSR
3366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3367 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3368 if (processor_alias_table[i].flags & PTA_XSAVE
3369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3370 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3371 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3373 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3374 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3375 x86_prefetch_sse = true;
3377 break;
3380 if (!strcmp (ix86_arch_string, "generic"))
3381 error ("generic CPU can be used only for %stune=%s %s",
3382 prefix, suffix, sw);
3383 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3384 error ("bad value (%s) for %sarch=%s %s",
3385 ix86_arch_string, prefix, suffix, sw);
3387 ix86_arch_mask = 1u << ix86_arch;
3388 for (i = 0; i < X86_ARCH_LAST; ++i)
3389 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
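  /* A minimal sketch of the masking just done, assuming X86_ARCH_CMOV is one
     of the X86_ARCH_* indices (TARGET_CMOV is tested further down): for
     ix86_arch == PROCESSOR_K8 the loop reduces each entry to

       ix86_arch_features[X86_ARCH_CMOV]
	 = !!(initial_ix86_arch_features[X86_ARCH_CMOV] & (1u << PROCESSOR_K8));

     i.e. every architecture feature is a bitmap indexed by processor, and
     the current -march selection picks exactly one bit out of it.  */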
3391 for (i = 0; i < pta_size; i++)
3392 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3394 ix86_schedule = processor_alias_table[i].schedule;
3395 ix86_tune = processor_alias_table[i].processor;
3396 if (TARGET_64BIT)
3398 if (!(processor_alias_table[i].flags & PTA_64BIT))
3400 if (ix86_tune_defaulted)
3402 ix86_tune_string = "x86-64";
3403 for (i = 0; i < pta_size; i++)
3404 if (! strcmp (ix86_tune_string,
3405 processor_alias_table[i].name))
3406 break;
3407 ix86_schedule = processor_alias_table[i].schedule;
3408 ix86_tune = processor_alias_table[i].processor;
3410 else
3411 error ("CPU you selected does not support x86-64 "
3412 "instruction set");
3415 else
3417 /* Adjust tuning when compiling for 32-bit ABI. */
3418 switch (ix86_tune)
3420 case PROCESSOR_GENERIC64:
3421 ix86_tune = PROCESSOR_GENERIC32;
3422 ix86_schedule = CPU_PENTIUMPRO;
3423 break;
3425 default:
3426 break;
3429 /* Intel CPUs have always interpreted SSE prefetch instructions as
3430 NOPs; so, we can enable SSE prefetch instructions even when
3431 -mtune (rather than -march) points us to a processor that has them.
3432 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3433 higher processors. */
3434 if (TARGET_CMOV
3435 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3436 x86_prefetch_sse = true;
3437 break;
3440 if (ix86_tune_specified && i == pta_size)
3441 error ("bad value (%s) for %stune=%s %s",
3442 ix86_tune_string, prefix, suffix, sw);
3444 ix86_tune_mask = 1u << ix86_tune;
3445 for (i = 0; i < X86_TUNE_LAST; ++i)
3446 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3448 #ifndef USE_IX86_FRAME_POINTER
3449 #define USE_IX86_FRAME_POINTER 0
3450 #endif
3452 #ifndef USE_X86_64_FRAME_POINTER
3453 #define USE_X86_64_FRAME_POINTER 0
3454 #endif
3456 /* Set the default values for switches whose default depends on TARGET_64BIT
3457 in case they weren't overwritten by command line options. */
3458 if (TARGET_64BIT)
3460 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3461 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3462 if (flag_asynchronous_unwind_tables == 2)
3463 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3464 if (flag_pcc_struct_return == 2)
3465 flag_pcc_struct_return = 0;
3467 else
3469 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3470 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3471 if (flag_asynchronous_unwind_tables == 2)
3472 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3473 if (flag_pcc_struct_return == 2)
3474 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3477 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3478 if (optimize_size)
3479 ix86_cost = &ix86_size_cost;
3480 else
3481 ix86_cost = ix86_tune_cost;
3483 /* Arrange to set up i386_stack_locals for all functions. */
3484 init_machine_status = ix86_init_machine_status;
3486 /* Validate -mregparm= value. */
3487 if (global_options_set.x_ix86_regparm)
3489 if (TARGET_64BIT)
3490 warning (0, "-mregparm is ignored in 64-bit mode");
3491 if (ix86_regparm > REGPARM_MAX)
3493 error ("-mregparm=%d is not between 0 and %d",
3494 ix86_regparm, REGPARM_MAX);
3495 ix86_regparm = 0;
3498 if (TARGET_64BIT)
3499 ix86_regparm = REGPARM_MAX;
3501 /* Default align_* from the processor table. */
3502 if (align_loops == 0)
3504 align_loops = processor_target_table[ix86_tune].align_loop;
3505 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3507 if (align_jumps == 0)
3509 align_jumps = processor_target_table[ix86_tune].align_jump;
3510 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3512 if (align_functions == 0)
3514 align_functions = processor_target_table[ix86_tune].align_func;
3517 /* Provide default for -mbranch-cost= value. */
3518 if (!global_options_set.x_ix86_branch_cost)
3519 ix86_branch_cost = ix86_cost->branch_cost;
3521 if (TARGET_64BIT)
3523 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3525 /* Enable by default the SSE and MMX builtins. Do allow the user to
3526 explicitly disable any of these. In particular, disabling SSE and
3527 MMX for kernel code is extremely useful. */
3528 if (!ix86_arch_specified)
3529 ix86_isa_flags
3530 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3531 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3533 if (TARGET_RTD)
3534       warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3536 else
3538 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3540 if (!ix86_arch_specified)
3541 ix86_isa_flags
3542 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3544       /* The i386 ABI does not specify a red zone.  It still makes sense to use it
3545          when the programmer takes care to keep the stack from being clobbered.  */
3546 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3547 target_flags |= MASK_NO_RED_ZONE;
3550 /* Keep nonleaf frame pointers. */
3551 if (flag_omit_frame_pointer)
3552 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3553 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3554 flag_omit_frame_pointer = 1;
3556 /* If we're doing fast math, we don't care about comparison order
3557 wrt NaNs. This lets us use a shorter comparison sequence. */
3558 if (flag_finite_math_only)
3559 target_flags &= ~MASK_IEEE_FP;
3561 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3562 since the insns won't need emulation. */
3563 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3564 target_flags &= ~MASK_NO_FANCY_MATH_387;
3566 /* Likewise, if the target doesn't have a 387, or we've specified
3567 software floating point, don't use 387 inline intrinsics. */
3568 if (!TARGET_80387)
3569 target_flags |= MASK_NO_FANCY_MATH_387;
3571 /* Turn on MMX builtins for -msse. */
3572 if (TARGET_SSE)
3573 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3575 /* Enable SSE prefetch. */
3576 if (TARGET_SSE || TARGET_PRFCHW)
3577 x86_prefetch_sse = true;
3579 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3580 if (TARGET_SSE4_2 || TARGET_ABM)
3581 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3583 /* Turn on lzcnt instruction for -mabm. */
3584 if (TARGET_ABM)
3585 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3587 /* Validate -mpreferred-stack-boundary= value or default it to
3588 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3589 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3590 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3592 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3593 int max = (TARGET_SEH ? 4 : 12);
3595 if (ix86_preferred_stack_boundary_arg < min
3596 || ix86_preferred_stack_boundary_arg > max)
3598 if (min == max)
3599 error ("-mpreferred-stack-boundary is not supported "
3600 "for this target");
3601 else
3602 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3603 ix86_preferred_stack_boundary_arg, min, max);
3605 else
3606 ix86_preferred_stack_boundary
3607 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
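  /* For example (a sketch of the conversion above): -mpreferred-stack-boundary=4
     gives (1 << 4) * BITS_PER_UNIT = 16 bytes = 128 bits, the usual
     SSE-friendly alignment; the argument is a log2 byte count, not a bit
     count.  */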
3610 /* Set the default value for -mstackrealign. */
3611 if (ix86_force_align_arg_pointer == -1)
3612 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3614 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3616 /* Validate -mincoming-stack-boundary= value or default it to
3617 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3618 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3619 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3621 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3622 || ix86_incoming_stack_boundary_arg > 12)
3623 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3624 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3625 else
3627 ix86_user_incoming_stack_boundary
3628 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3629 ix86_incoming_stack_boundary
3630 = ix86_user_incoming_stack_boundary;
3634 /* Accept -msseregparm only if at least SSE support is enabled. */
3635 if (TARGET_SSEREGPARM
3636 && ! TARGET_SSE)
3637 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3639 if (global_options_set.x_ix86_fpmath)
3641 if (ix86_fpmath & FPMATH_SSE)
3643 if (!TARGET_SSE)
3645 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3646 ix86_fpmath = FPMATH_387;
3648 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3650 warning (0, "387 instruction set disabled, using SSE arithmetics");
3651 ix86_fpmath = FPMATH_SSE;
3655 else
3656 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3658 /* If the i387 is disabled, then do not return values in it. */
3659 if (!TARGET_80387)
3660 target_flags &= ~MASK_FLOAT_RETURNS;
3662   /* Use an external vectorized library when vectorizing intrinsics.  */
3663 if (global_options_set.x_ix86_veclibabi_type)
3664 switch (ix86_veclibabi_type)
3666 case ix86_veclibabi_type_svml:
3667 ix86_veclib_handler = ix86_veclibabi_svml;
3668 break;
3670 case ix86_veclibabi_type_acml:
3671 ix86_veclib_handler = ix86_veclibabi_acml;
3672 break;
3674 default:
3675 gcc_unreachable ();
3678 if ((!USE_IX86_FRAME_POINTER
3679 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3680 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3681 && !optimize_size)
3682 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3684 /* ??? Unwind info is not correct around the CFG unless either a frame
3685 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3686 unwind info generation to be aware of the CFG and propagating states
3687 around edges. */
3688 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3689 || flag_exceptions || flag_non_call_exceptions)
3690 && flag_omit_frame_pointer
3691 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3693 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3694 warning (0, "unwind tables currently require either a frame pointer "
3695 "or %saccumulate-outgoing-args%s for correctness",
3696 prefix, suffix);
3697 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3700 /* If stack probes are required, the space used for large function
3701 arguments on the stack must also be probed, so enable
3702 -maccumulate-outgoing-args so this happens in the prologue. */
3703 if (TARGET_STACK_PROBE
3704 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3706 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3707 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3708 "for correctness", prefix, suffix);
3709 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3712 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3714 char *p;
3715 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3716 p = strchr (internal_label_prefix, 'X');
3717 internal_label_prefix_len = p - internal_label_prefix;
3718 *p = '\0';
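    /* A sketch of what this block computes, under the assumption of an
       ELF-style configuration where ASM_GENERATE_INTERNAL_LABEL would build
       something like "*.LX0" from the "LX" seed: the 'X' of the seed is
       located with strchr, everything before it ("*.L" in that hypothetical
       case) is kept as internal_label_prefix, and later code uses that
       prefix to recognize compiler-generated local labels.  */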
3721   /* When the scheduling description is not available, disable the scheduler pass
3722      so it won't slow down compilation or make x87 code slower.  */
3723 if (!TARGET_SCHEDULE)
3724 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3726 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3727 ix86_tune_cost->simultaneous_prefetches,
3728 global_options.x_param_values,
3729 global_options_set.x_param_values);
3730 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3731 ix86_tune_cost->prefetch_block,
3732 global_options.x_param_values,
3733 global_options_set.x_param_values);
3734 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3735 ix86_tune_cost->l1_cache_size,
3736 global_options.x_param_values,
3737 global_options_set.x_param_values);
3738 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3739 ix86_tune_cost->l2_cache_size,
3740 global_options.x_param_values,
3741 global_options_set.x_param_values);
3743   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3744 if (flag_prefetch_loop_arrays < 0
3745 && HAVE_prefetch
3746 && (optimize >= 3 || flag_profile_use)
3747 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3748 flag_prefetch_loop_arrays = 1;
3750 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3751 can be optimized to ap = __builtin_next_arg (0). */
3752 if (!TARGET_64BIT && !flag_split_stack)
3753 targetm.expand_builtin_va_start = NULL;
3755 if (TARGET_64BIT)
3757 ix86_gen_leave = gen_leave_rex64;
3758 if (Pmode == DImode)
3760 ix86_gen_monitor = gen_sse3_monitor64_di;
3761 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3762 ix86_gen_tls_local_dynamic_base_64
3763 = gen_tls_local_dynamic_base_64_di;
3765 else
3767 ix86_gen_monitor = gen_sse3_monitor64_si;
3768 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3769 ix86_gen_tls_local_dynamic_base_64
3770 = gen_tls_local_dynamic_base_64_si;
3773 else
3775 ix86_gen_leave = gen_leave;
3776 ix86_gen_monitor = gen_sse3_monitor;
3779 if (Pmode == DImode)
3781 ix86_gen_add3 = gen_adddi3;
3782 ix86_gen_sub3 = gen_subdi3;
3783 ix86_gen_sub3_carry = gen_subdi3_carry;
3784 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3785 ix86_gen_andsp = gen_anddi3;
3786 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3787 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3788 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3790 else
3792 ix86_gen_add3 = gen_addsi3;
3793 ix86_gen_sub3 = gen_subsi3;
3794 ix86_gen_sub3_carry = gen_subsi3_carry;
3795 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3796 ix86_gen_andsp = gen_andsi3;
3797 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3798 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3799 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3802 #ifdef USE_IX86_CLD
3803 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3804 if (!TARGET_64BIT)
3805 target_flags |= MASK_CLD & ~target_flags_explicit;
3806 #endif
3808 if (!TARGET_64BIT && flag_pic)
3810 if (flag_fentry > 0)
3811 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3812 "with -fpic");
3813 flag_fentry = 0;
3815 else if (TARGET_SEH)
3817 if (flag_fentry == 0)
3818 sorry ("-mno-fentry isn%'t compatible with SEH");
3819 flag_fentry = 1;
3821 else if (flag_fentry < 0)
3823 #if defined(PROFILE_BEFORE_PROLOGUE)
3824 flag_fentry = 1;
3825 #else
3826 flag_fentry = 0;
3827 #endif
3830 if (TARGET_AVX)
3832       /* When not optimizing for size, enable vzeroupper optimization for
3833 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3834 AVX unaligned load/store. */
3835 if (!optimize_size)
3837 if (flag_expensive_optimizations
3838 && !(target_flags_explicit & MASK_VZEROUPPER))
3839 target_flags |= MASK_VZEROUPPER;
3840 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3841 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3842 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3843 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3844 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3845 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3846 /* Enable 128-bit AVX instruction generation
3847 for the auto-vectorizer. */
3848 if (TARGET_AVX128_OPTIMAL
3849 && !(target_flags_explicit & MASK_PREFER_AVX128))
3850 target_flags |= MASK_PREFER_AVX128;
3853 else
3855 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3856 target_flags &= ~MASK_VZEROUPPER;
3859 if (ix86_recip_name)
3861 char *p = ASTRDUP (ix86_recip_name);
3862 char *q;
3863 unsigned int mask, i;
3864 bool invert;
3866 while ((q = strtok (p, ",")) != NULL)
3868 p = NULL;
3869 if (*q == '!')
3871 invert = true;
3872 q++;
3874 else
3875 invert = false;
3877 if (!strcmp (q, "default"))
3878 mask = RECIP_MASK_ALL;
3879 else
3881 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3882 if (!strcmp (q, recip_options[i].string))
3884 mask = recip_options[i].mask;
3885 break;
3888 if (i == ARRAY_SIZE (recip_options))
3890 error ("unknown option for -mrecip=%s", q);
3891 invert = false;
3892 mask = RECIP_MASK_NONE;
3896 recip_mask_explicit |= mask;
3897 if (invert)
3898 recip_mask &= ~mask;
3899 else
3900 recip_mask |= mask;
3904 if (TARGET_RECIP)
3905 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3906 else if (target_flags_explicit & MASK_RECIP)
3907 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
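  /* As a usage sketch (the option strings come from recip_options above):
     -mrecip=vec-sqrt,!div enables the reciprocal approximation for vector
     square roots while explicitly disabling it for scalar divides, and a bare
     -mrecip behaves like -mrecip=all for every bit that was not set or
     cleared explicitly, because only the non-explicit bits are merged in.  */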
3909 /* Default long double to 64-bit for Bionic. */
3910 if (TARGET_HAS_BIONIC
3911 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3912 target_flags |= MASK_LONG_DOUBLE_64;
3914   /* Save the initial options in case the user uses function-specific
3915 options. */
3916 if (main_args_p)
3917 target_option_default_node = target_option_current_node
3918 = build_target_option_node ();
3921 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3923 static void
3924 ix86_option_override (void)
3926 static struct register_pass_info insert_vzeroupper_info
3927 = { &pass_insert_vzeroupper.pass, "reload",
3928 1, PASS_POS_INSERT_AFTER
3931 ix86_option_override_internal (true);
3934 /* This needs to be done at start up. It's convenient to do it here. */
3935 register_pass (&insert_vzeroupper_info);
3938 /* Update register usage after having seen the compiler flags. */
3940 static void
3941 ix86_conditional_register_usage (void)
3943 int i, c_mask;
3944 unsigned int j;
3946 /* The PIC register, if it exists, is fixed. */
3947 j = PIC_OFFSET_TABLE_REGNUM;
3948 if (j != INVALID_REGNUM)
3949 fixed_regs[j] = call_used_regs[j] = 1;
3951 /* For 32-bit targets, squash the REX registers. */
3952 if (! TARGET_64BIT)
3954 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3955 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3956 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3960 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3961 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3962 : TARGET_64BIT ? (1 << 2)
3963 : (1 << 1));
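  /* Illustration of the conditional handling below, under the assumption
     (see CALL_USED_REGISTERS in i386.h) that entries greater than 1 are
     per-ABI bitmasks: an entry of (1 << 1) | (1 << 2) would mark a register
     call-used for the 32-bit and 64-bit SysV ABIs but call-saved for the
     64-bit MS ABI, because only the bit selected by c_mask survives.  */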
3965 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3967 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3969 /* Set/reset conditionally defined registers from
3970 CALL_USED_REGISTERS initializer. */
3971 if (call_used_regs[i] > 1)
3972 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3974       /* Build the CLOBBERED_REGS register set as the call-used registers
3975          taken from the GENERAL_REGS register set.  */
3976 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3977 && call_used_regs[i])
3978 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3981 /* If MMX is disabled, squash the registers. */
3982 if (! TARGET_MMX)
3983 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3984 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3985 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3987 /* If SSE is disabled, squash the registers. */
3988 if (! TARGET_SSE)
3989 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3990 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3993 /* If the FPU is disabled, squash the registers. */
3994 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3995 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3996 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3997 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4001 /* Save the current options */
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4006 ptr->arch = ix86_arch;
4007 ptr->schedule = ix86_schedule;
4008 ptr->tune = ix86_tune;
4009 ptr->branch_cost = ix86_branch_cost;
4010 ptr->tune_defaulted = ix86_tune_defaulted;
4011 ptr->arch_specified = ix86_arch_specified;
4012 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013 ptr->ix86_target_flags_explicit = target_flags_explicit;
4014 ptr->x_recip_mask_explicit = recip_mask_explicit;
4016 /* The fields are char but the variables are not; make sure the
4017 values fit in the fields. */
4018 gcc_assert (ptr->arch == ix86_arch);
4019 gcc_assert (ptr->schedule == ix86_schedule);
4020 gcc_assert (ptr->tune == ix86_tune);
4021 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4024 /* Restore the current options */
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4029 enum processor_type old_tune = ix86_tune;
4030 enum processor_type old_arch = ix86_arch;
4031 unsigned int ix86_arch_mask, ix86_tune_mask;
4032 int i;
4034 ix86_arch = (enum processor_type) ptr->arch;
4035 ix86_schedule = (enum attr_cpu) ptr->schedule;
4036 ix86_tune = (enum processor_type) ptr->tune;
4037 ix86_branch_cost = ptr->branch_cost;
4038 ix86_tune_defaulted = ptr->tune_defaulted;
4039 ix86_arch_specified = ptr->arch_specified;
4040 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041 target_flags_explicit = ptr->ix86_target_flags_explicit;
4042 recip_mask_explicit = ptr->x_recip_mask_explicit;
4044 /* Recreate the arch feature tests if the arch changed */
4045 if (old_arch != ix86_arch)
4047 ix86_arch_mask = 1u << ix86_arch;
4048 for (i = 0; i < X86_ARCH_LAST; ++i)
4049 ix86_arch_features[i]
4050 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4053 /* Recreate the tune optimization tests */
4054 if (old_tune != ix86_tune)
4056 ix86_tune_mask = 1u << ix86_tune;
4057 for (i = 0; i < X86_TUNE_LAST; ++i)
4058 ix86_tune_features[i]
4059 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4063 /* Print the current options */
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 struct cl_target_option *ptr)
4069 char *target_string
4070 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 NULL, NULL, ptr->x_ix86_fpmath, false);
4073 fprintf (file, "%*sarch = %d (%s)\n",
4074 indent, "",
4075 ptr->arch,
4076 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 ? cpu_names[ptr->arch]
4078 : "<unknown>"));
4080 fprintf (file, "%*stune = %d (%s)\n",
4081 indent, "",
4082 ptr->tune,
4083 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 ? cpu_names[ptr->tune]
4085 : "<unknown>"));
4087 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4089 if (target_string)
4091 fprintf (file, "%*s%s\n", indent, "", target_string);
4092 free (target_string);
4097 /* Inner function to process the attribute((target(...))): take an argument and
4098 set the current options from the argument. If we have a list, recursively go
4099 over the list. */
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 struct gcc_options *enum_opts_set)
4105 char *next_optstr;
4106 bool ret = true;
4108 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
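/* For instance, IX86_ATTR_ISA ("avx", OPT_mavx) above expands to
   { "avx", 3, ix86_opt_isa, OPT_mavx, 0 }: the literal option name, its
   length (sizeof - 1), how it is processed, the option enumerator, and a
   target_flags mask (unused for ISA options).  */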
4114 enum ix86_opt_type
4116 ix86_opt_unknown,
4117 ix86_opt_yes,
4118 ix86_opt_no,
4119 ix86_opt_str,
4120 ix86_opt_enum,
4121 ix86_opt_isa
4124 static const struct
4126 const char *string;
4127 size_t len;
4128 enum ix86_opt_type type;
4129 int opt;
4130 int mask;
4131 } attrs[] = {
4132 /* isa options */
4133 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4134 IX86_ATTR_ISA ("abm", OPT_mabm),
4135 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4136 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4137 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4138 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4139 IX86_ATTR_ISA ("aes", OPT_maes),
4140 IX86_ATTR_ISA ("avx", OPT_mavx),
4141 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4142 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4143 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4144 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4145 IX86_ATTR_ISA ("sse", OPT_msse),
4146 IX86_ATTR_ISA ("sse2", OPT_msse2),
4147 IX86_ATTR_ISA ("sse3", OPT_msse3),
4148 IX86_ATTR_ISA ("sse4", OPT_msse4),
4149 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4150 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4151 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4152 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4153 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4154 IX86_ATTR_ISA ("fma", OPT_mfma),
4155 IX86_ATTR_ISA ("xop", OPT_mxop),
4156 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4157 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4158 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4159 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4160 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4161 IX86_ATTR_ISA ("hle", OPT_mhle),
4162 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4163 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4164 IX86_ATTR_ISA ("adx", OPT_madx),
4165 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4166 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4167 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4169 /* enum options */
4170 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4172 /* string options */
4173 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4174 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4176 /* flag options */
4177 IX86_ATTR_YES ("cld",
4178 OPT_mcld,
4179 MASK_CLD),
4181 IX86_ATTR_NO ("fancy-math-387",
4182 OPT_mfancy_math_387,
4183 MASK_NO_FANCY_MATH_387),
4185 IX86_ATTR_YES ("ieee-fp",
4186 OPT_mieee_fp,
4187 MASK_IEEE_FP),
4189 IX86_ATTR_YES ("inline-all-stringops",
4190 OPT_minline_all_stringops,
4191 MASK_INLINE_ALL_STRINGOPS),
4193 IX86_ATTR_YES ("inline-stringops-dynamically",
4194 OPT_minline_stringops_dynamically,
4195 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4197 IX86_ATTR_NO ("align-stringops",
4198 OPT_mno_align_stringops,
4199 MASK_NO_ALIGN_STRINGOPS),
4201 IX86_ATTR_YES ("recip",
4202 OPT_mrecip,
4203 MASK_RECIP),
4207 /* If this is a list, recurse to get the options. */
4208 if (TREE_CODE (args) == TREE_LIST)
4210 bool ret = true;
4212 for (; args; args = TREE_CHAIN (args))
4213 if (TREE_VALUE (args)
4214 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4215 p_strings, enum_opts_set))
4216 ret = false;
4218 return ret;
4221 else if (TREE_CODE (args) != STRING_CST)
4222 gcc_unreachable ();
4224 /* Handle multiple arguments separated by commas. */
4225 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4227 while (next_optstr && *next_optstr != '\0')
4229 char *p = next_optstr;
4230 char *orig_p = p;
4231 char *comma = strchr (next_optstr, ',');
4232 const char *opt_string;
4233 size_t len, opt_len;
4234 int opt;
4235 bool opt_set_p;
4236 char ch;
4237 unsigned i;
4238 enum ix86_opt_type type = ix86_opt_unknown;
4239 int mask = 0;
4241 if (comma)
4243 *comma = '\0';
4244 len = comma - next_optstr;
4245 next_optstr = comma + 1;
4247 else
4249 len = strlen (p);
4250 next_optstr = NULL;
4253 /* Recognize no-xxx. */
4254 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4256 opt_set_p = false;
4257 p += 3;
4258 len -= 3;
4260 else
4261 opt_set_p = true;
4263 /* Find the option. */
4264 ch = *p;
4265 opt = N_OPTS;
4266 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4268 type = attrs[i].type;
4269 opt_len = attrs[i].len;
4270 if (ch == attrs[i].string[0]
4271 && ((type != ix86_opt_str && type != ix86_opt_enum)
4272 ? len == opt_len
4273 : len > opt_len)
4274 && memcmp (p, attrs[i].string, opt_len) == 0)
4276 opt = attrs[i].opt;
4277 mask = attrs[i].mask;
4278 opt_string = attrs[i].string;
4279 break;
4283 /* Process the option. */
4284 if (opt == N_OPTS)
4286 error ("attribute(target(\"%s\")) is unknown", orig_p);
4287 ret = false;
4290 else if (type == ix86_opt_isa)
4292 struct cl_decoded_option decoded;
4294 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4295 ix86_handle_option (&global_options, &global_options_set,
4296 &decoded, input_location);
4299 else if (type == ix86_opt_yes || type == ix86_opt_no)
4301 if (type == ix86_opt_no)
4302 opt_set_p = !opt_set_p;
4304 if (opt_set_p)
4305 target_flags |= mask;
4306 else
4307 target_flags &= ~mask;
4310 else if (type == ix86_opt_str)
4312 if (p_strings[opt])
4314 error ("option(\"%s\") was already specified", opt_string);
4315 ret = false;
4317 else
4318 p_strings[opt] = xstrdup (p + opt_len);
4321 else if (type == ix86_opt_enum)
4323 bool arg_ok;
4324 int value;
4326 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4327 if (arg_ok)
4328 set_option (&global_options, enum_opts_set, opt, value,
4329 p + opt_len, DK_UNSPECIFIED, input_location,
4330 global_dc);
4331 else
4333 error ("attribute(target(\"%s\")) is unknown", orig_p);
4334 ret = false;
4338 else
4339 gcc_unreachable ();
4342 return ret;
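/* As a usage sketch of the strings accepted above (anything outside the
   attrs[] table is rejected with the errors in this function):

     __attribute__((target("no-sse3,arch=core2,fpmath=sse")))
     int foo (void);

   "no-sse3" toggles an ISA option off, "arch=" is a string option picked up
   later by ix86_valid_target_attribute_tree, and "fpmath=" is an enum
   option.  */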
4345 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4347 tree
4348 ix86_valid_target_attribute_tree (tree args)
4350 const char *orig_arch_string = ix86_arch_string;
4351 const char *orig_tune_string = ix86_tune_string;
4352 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4353 int orig_tune_defaulted = ix86_tune_defaulted;
4354 int orig_arch_specified = ix86_arch_specified;
4355 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4356 tree t = NULL_TREE;
4357 int i;
4358 struct cl_target_option *def
4359 = TREE_TARGET_OPTION (target_option_default_node);
4360 struct gcc_options enum_opts_set;
4362 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4364 /* Process each of the options on the chain. */
4365 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4366 &enum_opts_set))
4367 return NULL_TREE;
4369 /* If the changed options are different from the default, rerun
4370 ix86_option_override_internal, and then save the options away.
4371      The string options are attribute options, and will be undone
4372 when we copy the save structure. */
4373 if (ix86_isa_flags != def->x_ix86_isa_flags
4374 || target_flags != def->x_target_flags
4375 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4376 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4377 || enum_opts_set.x_ix86_fpmath)
4379 /* If we are using the default tune= or arch=, undo the string assigned,
4380 and use the default. */
4381 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4382 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4383 else if (!orig_arch_specified)
4384 ix86_arch_string = NULL;
4386 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4387 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4388 else if (orig_tune_defaulted)
4389 ix86_tune_string = NULL;
4391 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4392 if (enum_opts_set.x_ix86_fpmath)
4393 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4394 else if (!TARGET_64BIT && TARGET_SSE)
4396 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4397 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4400 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4401 ix86_option_override_internal (false);
4403 /* Add any builtin functions with the new isa if any. */
4404 ix86_add_new_builtins (ix86_isa_flags);
4406 /* Save the current options unless we are validating options for
4407 #pragma. */
4408 t = build_target_option_node ();
4410 ix86_arch_string = orig_arch_string;
4411 ix86_tune_string = orig_tune_string;
4412 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4414 /* Free up memory allocated to hold the strings */
4415 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4416 free (option_strings[i]);
4419 return t;
4422 /* Hook to validate attribute((target("string"))). */
4424 static bool
4425 ix86_valid_target_attribute_p (tree fndecl,
4426 tree ARG_UNUSED (name),
4427 tree args,
4428 int ARG_UNUSED (flags))
4430 struct cl_target_option cur_target;
4431 bool ret = true;
4432 tree old_optimize = build_optimization_node ();
4433 tree new_target, new_optimize;
4434 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4436 /* If the function changed the optimization levels as well as setting target
4437 options, start with the optimizations specified. */
4438 if (func_optimize && func_optimize != old_optimize)
4439 cl_optimization_restore (&global_options,
4440 TREE_OPTIMIZATION (func_optimize));
4442 /* The target attributes may also change some optimization flags, so update
4443 the optimization options if necessary. */
4444 cl_target_option_save (&cur_target, &global_options);
4445 new_target = ix86_valid_target_attribute_tree (args);
4446 new_optimize = build_optimization_node ();
4448 if (!new_target)
4449 ret = false;
4451 else if (fndecl)
4453 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4455 if (old_optimize != new_optimize)
4456 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4459 cl_target_option_restore (&global_options, &cur_target);
4461 if (old_optimize != new_optimize)
4462 cl_optimization_restore (&global_options,
4463 TREE_OPTIMIZATION (old_optimize));
4465 return ret;
4469 /* Hook to determine if one function can safely inline another. */
4471 static bool
4472 ix86_can_inline_p (tree caller, tree callee)
4474 bool ret = false;
4475 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4476 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4478 /* If callee has no option attributes, then it is ok to inline. */
4479 if (!callee_tree)
4480 ret = true;
4482   /* If caller has no option attributes, but callee does, then it is not ok to
4483 inline. */
4484 else if (!caller_tree)
4485 ret = false;
4487 else
4489 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4490 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4492       /* Callee's isa options should be a subset of the caller's, i.e. an SSE4 function
4493          can inline an SSE2 function, but an SSE2 function can't inline an SSE4
4494          function.  */
4495 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4496 != callee_opts->x_ix86_isa_flags)
4497 ret = false;
4499 /* See if we have the same non-isa options. */
4500 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4501 ret = false;
4503 /* See if arch, tune, etc. are the same. */
4504 else if (caller_opts->arch != callee_opts->arch)
4505 ret = false;
4507 else if (caller_opts->tune != callee_opts->tune)
4508 ret = false;
4510 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4511 ret = false;
4513 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4514 ret = false;
4516 else
4517 ret = true;
4520 return ret;
4524 /* Remember the last target of ix86_set_current_function. */
4525 static GTY(()) tree ix86_previous_fndecl;
4527 /* Establish appropriate back-end context for processing the function
4528 FNDECL. The argument might be NULL to indicate processing at top
4529 level, outside of any function scope. */
4530 static void
4531 ix86_set_current_function (tree fndecl)
4533 /* Only change the context if the function changes. This hook is called
4534 several times in the course of compiling a function, and we don't want to
4535 slow things down too much or call target_reinit when it isn't safe. */
4536 if (fndecl && fndecl != ix86_previous_fndecl)
4538 tree old_tree = (ix86_previous_fndecl
4539 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4540 : NULL_TREE);
4542 tree new_tree = (fndecl
4543 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4544 : NULL_TREE);
4546 ix86_previous_fndecl = fndecl;
4547 if (old_tree == new_tree)
4550 else if (new_tree)
4552 cl_target_option_restore (&global_options,
4553 TREE_TARGET_OPTION (new_tree));
4554 target_reinit ();
4557 else if (old_tree)
4559 struct cl_target_option *def
4560 = TREE_TARGET_OPTION (target_option_current_node);
4562 cl_target_option_restore (&global_options, def);
4563 target_reinit ();
4569 /* Return true if this goes in large data/bss. */
4571 static bool
4572 ix86_in_large_data_p (tree exp)
4574 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4575 return false;
4577 /* Functions are never large data. */
4578 if (TREE_CODE (exp) == FUNCTION_DECL)
4579 return false;
4581 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4583 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4584 if (strcmp (section, ".ldata") == 0
4585 || strcmp (section, ".lbss") == 0)
4586 return true;
4587 return false;
4589 else
4591 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4593 /* If this is an incomplete type with size 0, then we can't put it
4594 in data because it might be too big when completed. */
4595 if (!size || size > ix86_section_threshold)
4596 return true;
4599 return false;
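/* A worked example of the threshold test above, assuming the default
   -mlarge-data-threshold of 65536: with -mcmodel=medium a 1 MiB array
   exceeds ix86_section_threshold and is placed in .ldata/.lbss, while a
   4 KiB array stays in the normal small-model sections.  */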
4602 /* Switch to the appropriate section for output of DECL.
4603 DECL is either a `VAR_DECL' node or a constant of some sort.
4604 RELOC indicates whether forming the initial value of DECL requires
4605 link-time relocations. */
4607 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4608 ATTRIBUTE_UNUSED;
4610 static section *
4611 x86_64_elf_select_section (tree decl, int reloc,
4612 unsigned HOST_WIDE_INT align)
4614 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4615 && ix86_in_large_data_p (decl))
4617 const char *sname = NULL;
4618 unsigned int flags = SECTION_WRITE;
4619 switch (categorize_decl_for_section (decl, reloc))
4621 case SECCAT_DATA:
4622 sname = ".ldata";
4623 break;
4624 case SECCAT_DATA_REL:
4625 sname = ".ldata.rel";
4626 break;
4627 case SECCAT_DATA_REL_LOCAL:
4628 sname = ".ldata.rel.local";
4629 break;
4630 case SECCAT_DATA_REL_RO:
4631 sname = ".ldata.rel.ro";
4632 break;
4633 case SECCAT_DATA_REL_RO_LOCAL:
4634 sname = ".ldata.rel.ro.local";
4635 break;
4636 case SECCAT_BSS:
4637 sname = ".lbss";
4638 flags |= SECTION_BSS;
4639 break;
4640 case SECCAT_RODATA:
4641 case SECCAT_RODATA_MERGE_STR:
4642 case SECCAT_RODATA_MERGE_STR_INIT:
4643 case SECCAT_RODATA_MERGE_CONST:
4644 sname = ".lrodata";
4645 flags = 0;
4646 break;
4647 case SECCAT_SRODATA:
4648 case SECCAT_SDATA:
4649 case SECCAT_SBSS:
4650 gcc_unreachable ();
4651 case SECCAT_TEXT:
4652 case SECCAT_TDATA:
4653 case SECCAT_TBSS:
4654 	case SECCAT_TBSS:
4654 	  /* We don't split these for the medium model.  Place them into
4655 	     default sections and hope for the best.  */
4656 break;
4658 if (sname)
4660 /* We might get called with string constants, but get_named_section
4661 doesn't like them as they are not DECLs. Also, we need to set
4662 flags in that case. */
4663 if (!DECL_P (decl))
4664 return get_section (sname, flags, NULL);
4665 return get_named_section (decl, sname, reloc);
4668 return default_elf_select_section (decl, reloc, align);
4671 /* Build up a unique section name, expressed as a
4672 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4673 RELOC indicates whether the initial value of EXP requires
4674 link-time relocations. */
4676 static void ATTRIBUTE_UNUSED
4677 x86_64_elf_unique_section (tree decl, int reloc)
4679 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4680 && ix86_in_large_data_p (decl))
4682 const char *prefix = NULL;
4683 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4684 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4686 switch (categorize_decl_for_section (decl, reloc))
4688 case SECCAT_DATA:
4689 case SECCAT_DATA_REL:
4690 case SECCAT_DATA_REL_LOCAL:
4691 case SECCAT_DATA_REL_RO:
4692 case SECCAT_DATA_REL_RO_LOCAL:
4693 prefix = one_only ? ".ld" : ".ldata";
4694 break;
4695 case SECCAT_BSS:
4696 prefix = one_only ? ".lb" : ".lbss";
4697 break;
4698 case SECCAT_RODATA:
4699 case SECCAT_RODATA_MERGE_STR:
4700 case SECCAT_RODATA_MERGE_STR_INIT:
4701 case SECCAT_RODATA_MERGE_CONST:
4702 prefix = one_only ? ".lr" : ".lrodata";
4703 break;
4704 case SECCAT_SRODATA:
4705 case SECCAT_SDATA:
4706 case SECCAT_SBSS:
4707 gcc_unreachable ();
4708 case SECCAT_TEXT:
4709 case SECCAT_TDATA:
4710 case SECCAT_TBSS:
4711 	  /* We don't split these for the medium model.  Place them into
4712 	     default sections and hope for the best.  */
4713 break;
4715 if (prefix)
4717 const char *name, *linkonce;
4718 char *string;
4720 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4721 name = targetm.strip_name_encoding (name);
4723 /* If we're using one_only, then there needs to be a .gnu.linkonce
4724 prefix to the section name. */
4725 linkonce = one_only ? ".gnu.linkonce" : "";
4727 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4729 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4730 return;
4733 default_unique_section (decl, reloc);
4736 #ifdef COMMON_ASM_OP
4737 /* This says how to output assembler code to declare an
4738 uninitialized external linkage data object.
4740    For medium-model x86-64 we need to use the .largecomm directive for
4741 large objects. */
4742 void
4743 x86_elf_aligned_common (FILE *file,
4744 const char *name, unsigned HOST_WIDE_INT size,
4745 int align)
4747 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4748 && size > (unsigned int)ix86_section_threshold)
4749 fputs (".largecomm\t", file);
4750 else
4751 fputs (COMMON_ASM_OP, file);
4752 assemble_name (file, name);
4753 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4754 size, align / BITS_PER_UNIT);
4756 #endif
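/* For example (derived from the fprintf format above), a 1 MiB object named
   "buf" with 32-byte alignment in the medium code model would be emitted as

     .largecomm	buf,1048576,32

   whereas at or below the threshold the normal COMMON_ASM_OP (.comm on ELF)
   is used with the same operands.  */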
4758 /* Utility function for targets to use in implementing
4759 ASM_OUTPUT_ALIGNED_BSS. */
4761 void
4762 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4763 const char *name, unsigned HOST_WIDE_INT size,
4764 int align)
4766 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4767 && size > (unsigned int)ix86_section_threshold)
4768 switch_to_section (get_named_section (decl, ".lbss", 0));
4769 else
4770 switch_to_section (bss_section);
4771 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4772 #ifdef ASM_DECLARE_OBJECT_NAME
4773 last_assemble_variable_decl = decl;
4774 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4775 #else
4776   /* The standard thing is to just output a label for the object.  */
4777 ASM_OUTPUT_LABEL (file, name);
4778 #endif /* ASM_DECLARE_OBJECT_NAME */
4779 ASM_OUTPUT_SKIP (file, size ? size : 1);
4782 /* Decide whether we must probe the stack before any space allocation
4783 on this target. It's essentially TARGET_STACK_PROBE except when
4784 -fstack-check causes the stack to be already probed differently. */
4786 bool
4787 ix86_target_stack_probe (void)
4789 /* Do not probe the stack twice if static stack checking is enabled. */
4790 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4791 return false;
4793 return TARGET_STACK_PROBE;
4796 /* Decide whether we can make a sibling call to a function. DECL is the
4797 declaration of the function being targeted by the call and EXP is the
4798 CALL_EXPR representing the call. */
4800 static bool
4801 ix86_function_ok_for_sibcall (tree decl, tree exp)
4803 tree type, decl_or_type;
4804 rtx a, b;
4806 /* If we are generating position-independent code, we cannot sibcall
4807 optimize any indirect call, or a direct call to a global function,
4808 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4809 if (!TARGET_MACHO
4810 && !TARGET_64BIT
4811 && flag_pic
4812 && (!decl || !targetm.binds_local_p (decl)))
4813 return false;
4815 /* If we need to align the outgoing stack, then sibcalling would
4816 unalign the stack, which may break the called function. */
4817 if (ix86_minimum_incoming_stack_boundary (true)
4818 < PREFERRED_STACK_BOUNDARY)
4819 return false;
4821 if (decl)
4823 decl_or_type = decl;
4824 type = TREE_TYPE (decl);
4826 else
4828 /* We're looking at the CALL_EXPR, we need the type of the function. */
4829 type = CALL_EXPR_FN (exp); /* pointer expression */
4830 type = TREE_TYPE (type); /* pointer type */
4831 type = TREE_TYPE (type); /* function type */
4832 decl_or_type = type;
4835 /* Check that the return value locations are the same. For example,
4836 if we are returning floats on the 80387 register stack, we cannot
4837 make a sibcall from a function that doesn't return a float to a
4838 function that does or, conversely, from a function that does return
4839 a float to a function that doesn't; the necessary stack adjustment
4840 would not be executed. This is also the place we notice
4841 differences in the return value ABI. Note that it is ok for one
4842 of the functions to have void return type as long as the return
4843 value of the other is passed in a register. */
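/* A small illustration (hypothetical declarations, not from this file):
   with the 80387 return convention,

       double callee (void);
       void caller (void) { callee (); }

   must not be turned into a sibcall, because CALLEE's result is left in
   ST(0) (a STACK_REG) and nothing would pop it; the STACK_REG check
   below rejects exactly this kind of mismatch.  */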
4844 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4845 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4846 cfun->decl, false);
4847 if (STACK_REG_P (a) || STACK_REG_P (b))
4849 if (!rtx_equal_p (a, b))
4850 return false;
4852 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4854 else if (!rtx_equal_p (a, b))
4855 return false;
4857 if (TARGET_64BIT)
4859 /* The SYSV ABI has more call-clobbered registers;
4860 disallow sibcalls from MS to SYSV. */
4861 if (cfun->machine->call_abi == MS_ABI
4862 && ix86_function_type_abi (type) == SYSV_ABI)
4863 return false;
4865 else
4867 /* If this call is indirect, we'll need to be able to use a
4868 call-clobbered register for the address of the target function.
4869 Make sure that not all such registers are used for passing
4870 parameters. Note that DLLIMPORT functions are indirect. */
4871 if (!decl
4872 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4874 if (ix86_function_regparm (type, NULL) >= 3)
4876 /* ??? Need to count the actual number of registers to be used,
4877 not the possible number of registers. Fix later. */
4878 return false;
4883 /* Otherwise okay. That also includes certain types of indirect calls. */
4884 return true;
4887 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4888 and "sseregparm" calling convention attributes;
4889 arguments as in struct attribute_spec.handler. */
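/* Illustrative usage (hypothetical declarations): the attributes handled
   here appear on function declarations and types, e.g.

       int __attribute__((regparm(3))) f3 (int a, int b, int c);
       int __attribute__((fastcall))   g2 (int a, int b);
       int __attribute__((stdcall))    h1 (int a);

   and the handler below rejects invalid combinations such as regparm
   together with fastcall or thiscall.  */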
4891 static tree
4892 ix86_handle_cconv_attribute (tree *node, tree name,
4893 tree args,
4894 int flags ATTRIBUTE_UNUSED,
4895 bool *no_add_attrs)
4897 if (TREE_CODE (*node) != FUNCTION_TYPE
4898 && TREE_CODE (*node) != METHOD_TYPE
4899 && TREE_CODE (*node) != FIELD_DECL
4900 && TREE_CODE (*node) != TYPE_DECL)
4902 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4903 name);
4904 *no_add_attrs = true;
4905 return NULL_TREE;
4908 /* Can combine regparm with all attributes but fastcall and thiscall. */
4909 if (is_attribute_p ("regparm", name))
4911 tree cst;
4913 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4915 error ("fastcall and regparm attributes are not compatible");
4918 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4920 error ("regparm and thiscall attributes are not compatible");
4923 cst = TREE_VALUE (args);
4924 if (TREE_CODE (cst) != INTEGER_CST)
4926 warning (OPT_Wattributes,
4927 "%qE attribute requires an integer constant argument",
4928 name);
4929 *no_add_attrs = true;
4931 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4933 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4934 name, REGPARM_MAX);
4935 *no_add_attrs = true;
4938 return NULL_TREE;
4941 if (TARGET_64BIT)
4943 /* Do not warn when emulating the MS ABI. */
4944 if ((TREE_CODE (*node) != FUNCTION_TYPE
4945 && TREE_CODE (*node) != METHOD_TYPE)
4946 || ix86_function_type_abi (*node) != MS_ABI)
4947 warning (OPT_Wattributes, "%qE attribute ignored",
4948 name);
4949 *no_add_attrs = true;
4950 return NULL_TREE;
4953 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4954 if (is_attribute_p ("fastcall", name))
4956 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4958 error ("fastcall and cdecl attributes are not compatible");
4960 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4962 error ("fastcall and stdcall attributes are not compatible");
4964 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4966 error ("fastcall and regparm attributes are not compatible");
4968 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4970 error ("fastcall and thiscall attributes are not compatible");
4974 /* Can combine stdcall with fastcall (redundant), regparm and
4975 sseregparm. */
4976 else if (is_attribute_p ("stdcall", name))
4978 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4980 error ("stdcall and cdecl attributes are not compatible");
4982 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4984 error ("stdcall and fastcall attributes are not compatible");
4986 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4988 error ("stdcall and thiscall attributes are not compatible");
4992 /* Can combine cdecl with regparm and sseregparm. */
4993 else if (is_attribute_p ("cdecl", name))
4995 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4997 error ("stdcall and cdecl attributes are not compatible");
4999 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5001 error ("fastcall and cdecl attributes are not compatible");
5003 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5005 error ("cdecl and thiscall attributes are not compatible");
5008 else if (is_attribute_p ("thiscall", name))
5010 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5011 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5012 name);
5013 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5015 error ("stdcall and thiscall attributes are not compatible");
5017 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 error ("fastcall and thiscall attributes are not compatible");
5021 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5023 error ("cdecl and thiscall attributes are not compatible");
5027 /* Can combine sseregparm with all attributes. */
5029 return NULL_TREE;
5032 /* The transactional memory builtins are implicitly regparm or fastcall
5033 depending on the ABI. Override the generic do-nothing attribute that
5034 these builtins were declared with, and replace it with one of the two
5035 attributes that we expect elsewhere. */
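/* Sketch of the effect, as implemented below: on 32-bit targets a TM
   builtin ends up as if declared __attribute__((fastcall)) when
   CHECK_STACK_LIMIT > 0 (used here as a proxy for 32-bit Windows), and
   as if declared __attribute__((regparm(2))) otherwise; 64-bit targets
   are left untouched.  */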
5037 static tree
5038 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5039 tree args ATTRIBUTE_UNUSED,
5040 int flags ATTRIBUTE_UNUSED,
5041 bool *no_add_attrs)
5043 tree alt;
5045 /* In no case do we want to add the placeholder attribute. */
5046 *no_add_attrs = true;
5048 /* The 64-bit ABI is unchanged for transactional memory. */
5049 if (TARGET_64BIT)
5050 return NULL_TREE;
5052 /* ??? Is there a better way to validate 32-bit Windows? We have
5053 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5054 if (CHECK_STACK_LIMIT > 0)
5055 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5056 else
5058 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5059 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5061 decl_attributes (node, alt, flags);
5063 return NULL_TREE;
5066 /* This function determines from TYPE the calling-convention. */
5068 unsigned int
5069 ix86_get_callcvt (const_tree type)
5071 unsigned int ret = 0;
5072 bool is_stdarg;
5073 tree attrs;
5075 if (TARGET_64BIT)
5076 return IX86_CALLCVT_CDECL;
5078 attrs = TYPE_ATTRIBUTES (type);
5079 if (attrs != NULL_TREE)
5081 if (lookup_attribute ("cdecl", attrs))
5082 ret |= IX86_CALLCVT_CDECL;
5083 else if (lookup_attribute ("stdcall", attrs))
5084 ret |= IX86_CALLCVT_STDCALL;
5085 else if (lookup_attribute ("fastcall", attrs))
5086 ret |= IX86_CALLCVT_FASTCALL;
5087 else if (lookup_attribute ("thiscall", attrs))
5088 ret |= IX86_CALLCVT_THISCALL;
5090 /* Regparm isn't allowed for thiscall and fastcall. */
5091 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5093 if (lookup_attribute ("regparm", attrs))
5094 ret |= IX86_CALLCVT_REGPARM;
5095 if (lookup_attribute ("sseregparm", attrs))
5096 ret |= IX86_CALLCVT_SSEREGPARM;
5099 if (IX86_BASE_CALLCVT(ret) != 0)
5100 return ret;
5103 is_stdarg = stdarg_p (type);
5104 if (TARGET_RTD && !is_stdarg)
5105 return IX86_CALLCVT_STDCALL | ret;
5107 if (ret != 0
5108 || is_stdarg
5109 || TREE_CODE (type) != METHOD_TYPE
5110 || ix86_function_type_abi (type) != MS_ABI)
5111 return IX86_CALLCVT_CDECL | ret;
5113 return IX86_CALLCVT_THISCALL;
5116 /* Return 0 if the attributes for two types are incompatible, 1 if they
5117 are compatible, and 2 if they are nearly compatible (which causes a
5118 warning to be generated). */
5120 static int
5121 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5123 unsigned int ccvt1, ccvt2;
5125 if (TREE_CODE (type1) != FUNCTION_TYPE
5126 && TREE_CODE (type1) != METHOD_TYPE)
5127 return 1;
5129 ccvt1 = ix86_get_callcvt (type1);
5130 ccvt2 = ix86_get_callcvt (type2);
5131 if (ccvt1 != ccvt2)
5132 return 0;
5133 if (ix86_function_regparm (type1, NULL)
5134 != ix86_function_regparm (type2, NULL))
5135 return 0;
5137 return 1;
5140 /* Return the regparm value for a function with the indicated TYPE and DECL.
5141 DECL may be NULL when calling function indirectly
5142 or considering a libcall. */
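/* For reference (conventional 32-bit behavior, shown as an example): with
   regparm(N) the first N (up to three) integer arguments go in EAX, EDX
   and ECX in that order, so a hypothetical

       int __attribute__((regparm(2))) add2 (int a, int b);

   receives A in EAX and B in EDX.  */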
5144 static int
5145 ix86_function_regparm (const_tree type, const_tree decl)
5147 tree attr;
5148 int regparm;
5149 unsigned int ccvt;
5151 if (TARGET_64BIT)
5152 return (ix86_function_type_abi (type) == SYSV_ABI
5153 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5154 ccvt = ix86_get_callcvt (type);
5155 regparm = ix86_regparm;
5157 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5159 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5160 if (attr)
5162 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5163 return regparm;
5166 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5167 return 2;
5168 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5169 return 1;
5171 /* Use register calling convention for local functions when possible. */
5172 if (decl
5173 && TREE_CODE (decl) == FUNCTION_DECL
5174 && optimize
5175 && !(profile_flag && !flag_fentry))
5177 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5178 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5179 if (i && i->local && i->can_change_signature)
5181 int local_regparm, globals = 0, regno;
5183 /* Make sure no regparm register is taken by a
5184 fixed register variable. */
5185 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5186 if (fixed_regs[local_regparm])
5187 break;
5189 /* We don't want to use regparm(3) for nested functions as
5190 these use a static chain pointer in the third argument. */
5191 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5192 local_regparm = 2;
5194 /* In 32-bit mode save a register for the split stack. */
5195 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5196 local_regparm = 2;
5198 /* Each fixed register usage increases register pressure,
5199 so fewer registers should be used for argument passing.
5200 This functionality can be overridden by an explicit
5201 regparm value. */
5202 for (regno = AX_REG; regno <= DI_REG; regno++)
5203 if (fixed_regs[regno])
5204 globals++;
5206 local_regparm
5207 = globals < local_regparm ? local_regparm - globals : 0;
5209 if (local_regparm > regparm)
5210 regparm = local_regparm;
5214 return regparm;
5217 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5218 DFmode (2) arguments in SSE registers for a function with the
5219 indicated TYPE and DECL. DECL may be NULL when calling function
5220 indirectly or considering a libcall. Otherwise return 0. */
5222 static int
5223 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5225 gcc_assert (!TARGET_64BIT);
5227 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5228 by the sseregparm attribute. */
5229 if (TARGET_SSEREGPARM
5230 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5232 if (!TARGET_SSE)
5234 if (warn)
5236 if (decl)
5237 error ("calling %qD with attribute sseregparm without "
5238 "SSE/SSE2 enabled", decl);
5239 else
5240 error ("calling %qT with attribute sseregparm without "
5241 "SSE/SSE2 enabled", type);
5243 return 0;
5246 return 2;
5249 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5250 (and DFmode for SSE2) arguments in SSE registers. */
5251 if (decl && TARGET_SSE_MATH && optimize
5252 && !(profile_flag && !flag_fentry))
5254 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5255 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5256 if (i && i->local && i->can_change_signature)
5257 return TARGET_SSE2 ? 2 : 1;
5260 return 0;
5263 /* Return true if EAX is live at the start of the function. Used by
5264 ix86_expand_prologue to determine if we need special help before
5265 calling allocate_stack_worker. */
5267 static bool
5268 ix86_eax_live_at_start_p (void)
5270 /* Cheat. Don't bother working forward from ix86_function_regparm
5271 to the function type to whether an actual argument is located in
5272 eax. Instead just look at cfg info, which is still close enough
5273 to correct at this point. This gives false positives for broken
5274 functions that might use uninitialized data that happens to be
5275 allocated in eax, but who cares? */
5276 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5279 static bool
5280 ix86_keep_aggregate_return_pointer (tree fntype)
5282 tree attr;
5284 if (!TARGET_64BIT)
5286 attr = lookup_attribute ("callee_pop_aggregate_return",
5287 TYPE_ATTRIBUTES (fntype));
5288 if (attr)
5289 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5291 /* For 32-bit MS-ABI the default is to keep aggregate
5292 return pointer. */
5293 if (ix86_function_type_abi (fntype) == MS_ABI)
5294 return true;
5296 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5299 /* Value is the number of bytes of arguments automatically
5300 popped when returning from a subroutine call.
5301 FUNDECL is the declaration node of the function (as a tree),
5302 FUNTYPE is the data type of the function (as a tree),
5303 or for a library call it is an identifier node for the subroutine name.
5304 SIZE is the number of bytes of arguments passed on the stack.
5306 On the 80386, the RTD insn may be used to pop them if the number
5307 of args is fixed, but if the number is variable then the caller
5308 must pop them all. RTD can't be used for library calls now
5309 because the library is compiled with the Unix compiler.
5310 Use of RTD is a selectable option, since it is incompatible with
5311 standard Unix calling sequences. If the option is not selected,
5312 the caller must always pop the args.
5314 The attribute stdcall is equivalent to RTD on a per module basis. */
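/* Worked example (hypothetical declaration): for

       int __attribute__((stdcall)) f (int a, int b);

   8 bytes of arguments are on the stack, this hook returns 8 and the
   callee returns with "ret $8"; a stdarg or plain cdecl function gets 0
   here and the caller pops its own arguments.  */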
5316 static int
5317 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5319 unsigned int ccvt;
5321 /* None of the 64-bit ABIs pop arguments. */
5322 if (TARGET_64BIT)
5323 return 0;
5325 ccvt = ix86_get_callcvt (funtype);
5327 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5328 | IX86_CALLCVT_THISCALL)) != 0
5329 && ! stdarg_p (funtype))
5330 return size;
5332 /* Lose any fake structure return argument if it is passed on the stack. */
5333 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5334 && !ix86_keep_aggregate_return_pointer (funtype))
5336 int nregs = ix86_function_regparm (funtype, fundecl);
5337 if (nregs == 0)
5338 return GET_MODE_SIZE (Pmode);
5341 return 0;
5344 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5346 static bool
5347 ix86_legitimate_combined_insn (rtx insn)
5349 /* Check operand constraints in case hard registers were propagated
5350 into the insn pattern. This check prevents the combine pass from
5351 generating insn patterns with invalid hard register operands.
5352 These invalid insns can eventually confuse reload into erroring
5353 out with a spill failure. See also PRs 46829 and 46843. */
5354 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5356 int i;
5358 extract_insn (insn);
5359 preprocess_constraints ();
5361 for (i = 0; i < recog_data.n_operands; i++)
5363 rtx op = recog_data.operand[i];
5364 enum machine_mode mode = GET_MODE (op);
5365 struct operand_alternative *op_alt;
5366 int offset = 0;
5367 bool win;
5368 int j;
5370 /* A unary operator may be accepted by the predicate, but it
5371 is irrelevant for matching constraints. */
5372 if (UNARY_P (op))
5373 op = XEXP (op, 0);
5375 if (GET_CODE (op) == SUBREG)
5377 if (REG_P (SUBREG_REG (op))
5378 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5379 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5380 GET_MODE (SUBREG_REG (op)),
5381 SUBREG_BYTE (op),
5382 GET_MODE (op));
5383 op = SUBREG_REG (op);
5386 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5387 continue;
5389 op_alt = recog_op_alt[i];
5391 /* Operand has no constraints, anything is OK. */
5392 win = !recog_data.n_alternatives;
5394 for (j = 0; j < recog_data.n_alternatives; j++)
5396 if (op_alt[j].anything_ok
5397 || (op_alt[j].matches != -1
5398 && operands_match_p
5399 (recog_data.operand[i],
5400 recog_data.operand[op_alt[j].matches]))
5401 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5403 win = true;
5404 break;
5408 if (!win)
5409 return false;
5413 return true;
5416 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5418 static unsigned HOST_WIDE_INT
5419 ix86_asan_shadow_offset (void)
5421 return (unsigned HOST_WIDE_INT) 1 << (TARGET_LP64 ? 44 : 29);
5424 /* Argument support functions. */
5426 /* Return true when register may be used to pass function parameters. */
5427 bool
5428 ix86_function_arg_regno_p (int regno)
5430 int i;
5431 const int *parm_regs;
5433 if (!TARGET_64BIT)
5435 if (TARGET_MACHO)
5436 return (regno < REGPARM_MAX
5437 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5438 else
5439 return (regno < REGPARM_MAX
5440 || (TARGET_MMX && MMX_REGNO_P (regno)
5441 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5442 || (TARGET_SSE && SSE_REGNO_P (regno)
5443 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5446 if (TARGET_MACHO)
5448 if (SSE_REGNO_P (regno) && TARGET_SSE)
5449 return true;
5451 else
5453 if (TARGET_SSE && SSE_REGNO_P (regno)
5454 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5455 return true;
5458 /* TODO: The function should depend on current function ABI but
5459 builtins.c would need updating then. Therefore we use the
5460 default ABI. */
5462 /* RAX is used as hidden argument to va_arg functions. */
5463 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5464 return true;
5466 if (ix86_abi == MS_ABI)
5467 parm_regs = x86_64_ms_abi_int_parameter_registers;
5468 else
5469 parm_regs = x86_64_int_parameter_registers;
5470 for (i = 0; i < (ix86_abi == MS_ABI
5471 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5472 if (regno == parm_regs[i])
5473 return true;
5474 return false;
5477 /* Return true if we do not know how to pass TYPE solely in registers. */
5479 static bool
5480 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5482 if (must_pass_in_stack_var_size_or_pad (mode, type))
5483 return true;
5485 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5486 The layout_type routine is crafty and tries to trick us into passing
5487 currently unsupported vector types on the stack by using TImode. */
5488 return (!TARGET_64BIT && mode == TImode
5489 && type && TREE_CODE (type) != VECTOR_TYPE);
5492 /* Return the size, in bytes, of the area reserved for arguments passed
5493 in registers for the function represented by FNDECL, depending on the
5494 ABI used. */
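/* Background note (Windows x64 convention, assumed here rather than
   restated from the ABI document): the 32 bytes returned for the 64-bit
   MS ABI are the "home"/shadow area the caller reserves for the four
   register-passed arguments; the SysV ABI reserves no such area, hence
   the 0 in that case.  */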
5496 ix86_reg_parm_stack_space (const_tree fndecl)
5498 enum calling_abi call_abi = SYSV_ABI;
5499 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5500 call_abi = ix86_function_abi (fndecl);
5501 else
5502 call_abi = ix86_function_type_abi (fndecl);
5503 if (TARGET_64BIT && call_abi == MS_ABI)
5504 return 32;
5505 return 0;
5508 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5509 call ABI used. */
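/* Illustrative usage (hypothetical declarations): the per-type override
   looks like

       int __attribute__((ms_abi))   wf (int a, int b, int c, int d);
       int __attribute__((sysv_abi)) uf (int a, int b, int c, int d);

   so WF takes its integer arguments in RCX, RDX, R8 and R9 while UF uses
   RDI, RSI, RDX and RCX, independently of the default ix86_abi.  */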
5510 enum calling_abi
5511 ix86_function_type_abi (const_tree fntype)
5513 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5515 enum calling_abi abi = ix86_abi;
5516 if (abi == SYSV_ABI)
5518 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5519 abi = MS_ABI;
5521 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5522 abi = SYSV_ABI;
5523 return abi;
5525 return ix86_abi;
5528 static bool
5529 ix86_function_ms_hook_prologue (const_tree fn)
5531 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5533 if (decl_function_context (fn) != NULL_TREE)
5534 error_at (DECL_SOURCE_LOCATION (fn),
5535 "ms_hook_prologue is not compatible with nested function");
5536 else
5537 return true;
5539 return false;
5542 static enum calling_abi
5543 ix86_function_abi (const_tree fndecl)
5545 if (! fndecl)
5546 return ix86_abi;
5547 return ix86_function_type_abi (TREE_TYPE (fndecl));
5550 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5551 call ABI used. */
5552 enum calling_abi
5553 ix86_cfun_abi (void)
5555 if (! cfun)
5556 return ix86_abi;
5557 return cfun->machine->call_abi;
5560 /* Write the extra assembler code needed to declare a function properly. */
5562 void
5563 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5564 tree decl)
5566 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5568 if (is_ms_hook)
5570 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5571 unsigned int filler_cc = 0xcccccccc;
5573 for (i = 0; i < filler_count; i += 4)
5574 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5577 #ifdef SUBTARGET_ASM_UNWIND_INIT
5578 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5579 #endif
5581 ASM_OUTPUT_LABEL (asm_out_file, fname);
5583 /* Output magic byte marker, if hot-patch attribute is set. */
5584 if (is_ms_hook)
5586 if (TARGET_64BIT)
5588 /* leaq [%rsp + 0], %rsp */
5589 asm_fprintf (asm_out_file, ASM_BYTE
5590 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5592 else
5594 /* movl.s %edi, %edi
5595 push %ebp
5596 movl.s %esp, %ebp */
5597 asm_fprintf (asm_out_file, ASM_BYTE
5598 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5603 /* regclass.c */
5604 extern void init_regs (void);
5606 /* Implementation of the call ABI switching target hook. The call
5607 register sets specific to FNDECL are set up. See also
5608 ix86_conditional_register_usage for more details. */
5609 void
5610 ix86_call_abi_override (const_tree fndecl)
5612 if (fndecl == NULL_TREE)
5613 cfun->machine->call_abi = ix86_abi;
5614 else
5615 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5618 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5619 Avoid expensive re-initialization of init_regs each time we switch function
5620 context, since this is needed only during RTL expansion. */
5621 static void
5622 ix86_maybe_switch_abi (void)
5624 if (TARGET_64BIT &&
5625 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5626 reinit_regs ();
5629 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5630 for a call to a function whose data type is FNTYPE.
5631 For a library call, FNTYPE is 0. */
5633 void
5634 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5635 tree fntype, /* tree ptr for function decl */
5636 rtx libname, /* SYMBOL_REF of library name or 0 */
5637 tree fndecl,
5638 int caller)
5640 struct cgraph_local_info *i;
5642 memset (cum, 0, sizeof (*cum));
5644 if (fndecl)
5646 i = cgraph_local_info (fndecl);
5647 cum->call_abi = ix86_function_abi (fndecl);
5649 else
5651 i = NULL;
5652 cum->call_abi = ix86_function_type_abi (fntype);
5655 cum->caller = caller;
5657 /* Set up the number of registers to use for passing arguments. */
5659 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5660 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5661 "or subtarget optimization implying it");
5662 cum->nregs = ix86_regparm;
5663 if (TARGET_64BIT)
5665 cum->nregs = (cum->call_abi == SYSV_ABI
5666 ? X86_64_REGPARM_MAX
5667 : X86_64_MS_REGPARM_MAX);
5669 if (TARGET_SSE)
5671 cum->sse_nregs = SSE_REGPARM_MAX;
5672 if (TARGET_64BIT)
5674 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5675 ? X86_64_SSE_REGPARM_MAX
5676 : X86_64_MS_SSE_REGPARM_MAX);
5679 if (TARGET_MMX)
5680 cum->mmx_nregs = MMX_REGPARM_MAX;
5681 cum->warn_avx = true;
5682 cum->warn_sse = true;
5683 cum->warn_mmx = true;
5685 /* Because the type might mismatch between caller and callee, we need to
5686 use the actual type of the function for local calls.
5687 FIXME: cgraph_analyze can be told to actually record whether a function
5688 uses va_start, so for local functions maybe_vaarg can be made more
5689 aggressive, helping K&R code.
5690 FIXME: once the type system is fixed, we won't need this code anymore. */
5691 if (i && i->local && i->can_change_signature)
5692 fntype = TREE_TYPE (fndecl);
5693 cum->maybe_vaarg = (fntype
5694 ? (!prototype_p (fntype) || stdarg_p (fntype))
5695 : !libname);
5697 if (!TARGET_64BIT)
5699 /* If there are variable arguments, then we won't pass anything
5700 in registers in 32-bit mode. */
5701 if (stdarg_p (fntype))
5703 cum->nregs = 0;
5704 cum->sse_nregs = 0;
5705 cum->mmx_nregs = 0;
5706 cum->warn_avx = 0;
5707 cum->warn_sse = 0;
5708 cum->warn_mmx = 0;
5709 return;
5712 /* Use ecx and edx registers if function has fastcall attribute,
5713 else look for regparm information. */
5714 if (fntype)
5716 unsigned int ccvt = ix86_get_callcvt (fntype);
5717 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5719 cum->nregs = 1;
5720 cum->fastcall = 1; /* Same first register as in fastcall. */
5722 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5724 cum->nregs = 2;
5725 cum->fastcall = 1;
5727 else
5728 cum->nregs = ix86_function_regparm (fntype, fndecl);
5731 /* Set up the number of SSE registers used for passing SFmode
5732 and DFmode arguments. Warn for mismatching ABI. */
5733 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5737 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5738 But in the case of vector types, it is some vector mode.
5740 When we have only some of our vector isa extensions enabled, then there
5741 are some modes for which vector_mode_supported_p is false. For these
5742 modes, the generic vector support in gcc will choose some non-vector mode
5743 in order to implement the type. By computing the natural mode, we'll
5744 select the proper ABI location for the operand and not depend on whatever
5745 the middle-end decides to do with these vector types.
5747 The middle-end can't deal with vector types larger than 16 bytes. In this
5748 case, we return the original mode and warn about the ABI change if CUM
5749 isn't NULL. */
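/* Example (hypothetical type using the GNU vector extension): for

       typedef float v4sf __attribute__((vector_size (16)));

   TYPE_MODE may be a non-vector mode when SSE is disabled, but the loop
   below still finds V4SFmode as the natural mode, so the ABI location of
   a v4sf argument does not depend on which ISA extensions are enabled;
   only the warning below is emitted.  */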
5751 static enum machine_mode
5752 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5754 enum machine_mode mode = TYPE_MODE (type);
5756 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5758 HOST_WIDE_INT size = int_size_in_bytes (type);
5759 if ((size == 8 || size == 16 || size == 32)
5760 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5761 && TYPE_VECTOR_SUBPARTS (type) > 1)
5763 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5765 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5766 mode = MIN_MODE_VECTOR_FLOAT;
5767 else
5768 mode = MIN_MODE_VECTOR_INT;
5770 /* Get the mode which has this inner mode and number of units. */
5771 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5772 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5773 && GET_MODE_INNER (mode) == innermode)
5775 if (size == 32 && !TARGET_AVX)
5777 static bool warnedavx;
5779 if (cum
5780 && !warnedavx
5781 && cum->warn_avx)
5783 warnedavx = true;
5784 warning (0, "AVX vector argument without AVX "
5785 "enabled changes the ABI");
5787 return TYPE_MODE (type);
5789 else if ((size == 8 || size == 16) && !TARGET_SSE)
5791 static bool warnedsse;
5793 if (cum
5794 && !warnedsse
5795 && cum->warn_sse)
5797 warnedsse = true;
5798 warning (0, "SSE vector argument without SSE "
5799 "enabled changes the ABI");
5801 return mode;
5803 else
5804 return mode;
5807 gcc_unreachable ();
5811 return mode;
5814 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5815 this may not agree with the mode that the type system has chosen for the
5816 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5817 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5819 static rtx
5820 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5821 unsigned int regno)
5823 rtx tmp;
5825 if (orig_mode != BLKmode)
5826 tmp = gen_rtx_REG (orig_mode, regno);
5827 else
5829 tmp = gen_rtx_REG (mode, regno);
5830 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5831 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5834 return tmp;
5837 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5838 The goal of this code is to classify each eightbyte of the incoming
5839 argument by register class and assign registers accordingly. */
5841 /* Return the union class of CLASS1 and CLASS2.
5842 See the x86-64 PS ABI for details. */
5844 static enum x86_64_reg_class
5845 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5847 /* Rule #1: If both classes are equal, this is the resulting class. */
5848 if (class1 == class2)
5849 return class1;
5851 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5852 the other class. */
5853 if (class1 == X86_64_NO_CLASS)
5854 return class2;
5855 if (class2 == X86_64_NO_CLASS)
5856 return class1;
5858 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5859 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5860 return X86_64_MEMORY_CLASS;
5862 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5863 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5864 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5865 return X86_64_INTEGERSI_CLASS;
5866 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5867 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5868 return X86_64_INTEGER_CLASS;
5870 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5871 MEMORY is used. */
5872 if (class1 == X86_64_X87_CLASS
5873 || class1 == X86_64_X87UP_CLASS
5874 || class1 == X86_64_COMPLEX_X87_CLASS
5875 || class2 == X86_64_X87_CLASS
5876 || class2 == X86_64_X87UP_CLASS
5877 || class2 == X86_64_COMPLEX_X87_CLASS)
5878 return X86_64_MEMORY_CLASS;
5880 /* Rule #6: Otherwise class SSE is used. */
5881 return X86_64_SSE_CLASS;
5884 /* Classify the argument of type TYPE and mode MODE.
5885 CLASSES will be filled with the register class used to pass each word
5886 of the operand. The number of words is returned. In case the parameter
5887 should be passed in memory, 0 is returned. As a special case for
5888 zero-sized containers, classes[0] will be NO_CLASS and 1 is returned.
5890 BIT_OFFSET is used internally for handling records and specifies the
5891 offset in bits modulo 256 to avoid overflow cases.
5893 See the x86-64 PS ABI for details. */
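/* Worked example (hypothetical aggregate, following the rules below):

       struct s { double d; long l; };

   occupies two eightbytes; the first is classified X86_64_SSEDF_CLASS and
   the second X86_64_INTEGER_CLASS, so classify_argument returns 2 and the
   structure is passed in one SSE register and one integer register.  */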
5896 static int
5897 classify_argument (enum machine_mode mode, const_tree type,
5898 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5900 HOST_WIDE_INT bytes =
5901 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5902 int words
5903 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5905 /* Variable sized entities are always passed/returned in memory. */
5906 if (bytes < 0)
5907 return 0;
5909 if (mode != VOIDmode
5910 && targetm.calls.must_pass_in_stack (mode, type))
5911 return 0;
5913 /* Special case check for pointer to shared, on 64-bit target. */
5914 if (TARGET_64BIT && mode == TImode
5915 && type && TREE_CODE (type) == POINTER_TYPE
5916 && upc_shared_type_p (TREE_TYPE (type)))
5918 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5919 return 2;
5922 if (type && AGGREGATE_TYPE_P (type))
5924 int i;
5925 tree field;
5926 enum x86_64_reg_class subclasses[MAX_CLASSES];
5928 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5929 if (bytes > 32)
5930 return 0;
5932 for (i = 0; i < words; i++)
5933 classes[i] = X86_64_NO_CLASS;
5935 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5936 signal the memory class, so handle this as a special case. */
5937 if (!words)
5939 classes[0] = X86_64_NO_CLASS;
5940 return 1;
5943 /* Classify each field of record and merge classes. */
5944 switch (TREE_CODE (type))
5946 case RECORD_TYPE:
5947 /* And now merge the fields of structure. */
5948 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5950 if (TREE_CODE (field) == FIELD_DECL)
5952 int num;
5954 if (TREE_TYPE (field) == error_mark_node)
5955 continue;
5957 /* Bitfields are always classified as integer. Handle them
5958 early, since later code would consider them to be
5959 misaligned integers. */
5960 if (DECL_BIT_FIELD (field))
5962 for (i = (int_bit_position (field)
5963 + (bit_offset % 64)) / 8 / 8;
5964 i < ((int_bit_position (field) + (bit_offset % 64))
5965 + tree_low_cst (DECL_SIZE (field), 0)
5966 + 63) / 8 / 8; i++)
5967 classes[i] =
5968 merge_classes (X86_64_INTEGER_CLASS,
5969 classes[i]);
5971 else
5973 int pos;
5975 type = TREE_TYPE (field);
5977 /* Flexible array member is ignored. */
5978 if (TYPE_MODE (type) == BLKmode
5979 && TREE_CODE (type) == ARRAY_TYPE
5980 && TYPE_SIZE (type) == NULL_TREE
5981 && TYPE_DOMAIN (type) != NULL_TREE
5982 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5983 == NULL_TREE))
5985 static bool warned;
5987 if (!warned && warn_psabi)
5989 warned = true;
5990 inform (input_location,
5991 "the ABI of passing struct with"
5992 " a flexible array member has"
5993 " changed in GCC 4.4");
5995 continue;
5997 num = classify_argument (TYPE_MODE (type), type,
5998 subclasses,
5999 (int_bit_position (field)
6000 + bit_offset) % 256);
6001 if (!num)
6002 return 0;
6003 pos = (int_bit_position (field)
6004 + (bit_offset % 64)) / 8 / 8;
6005 for (i = 0; i < num && (i + pos) < words; i++)
6006 classes[i + pos] =
6007 merge_classes (subclasses[i], classes[i + pos]);
6011 break;
6013 case ARRAY_TYPE:
6014 /* Arrays are handled as small records. */
6016 int num;
6017 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6018 TREE_TYPE (type), subclasses, bit_offset);
6019 if (!num)
6020 return 0;
6022 /* The partial classes are now full classes. */
6023 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6024 subclasses[0] = X86_64_SSE_CLASS;
6025 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6026 && !((bit_offset % 64) == 0 && bytes == 4))
6027 subclasses[0] = X86_64_INTEGER_CLASS;
6029 for (i = 0; i < words; i++)
6030 classes[i] = subclasses[i % num];
6032 break;
6034 case UNION_TYPE:
6035 case QUAL_UNION_TYPE:
6036 /* Unions are similar to RECORD_TYPE but offset is always 0. */
6038 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6040 if (TREE_CODE (field) == FIELD_DECL)
6042 int num;
6044 if (TREE_TYPE (field) == error_mark_node)
6045 continue;
6047 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6048 TREE_TYPE (field), subclasses,
6049 bit_offset);
6050 if (!num)
6051 return 0;
6052 for (i = 0; i < num; i++)
6053 classes[i] = merge_classes (subclasses[i], classes[i]);
6056 break;
6058 default:
6059 gcc_unreachable ();
6062 if (words > 2)
6064 /* When the size is > 16 bytes, if the first class isn't
6065 X86_64_SSE_CLASS or any of the others isn't
6066 X86_64_SSEUP_CLASS, everything should be passed in
6067 memory. */
6068 if (classes[0] != X86_64_SSE_CLASS)
6069 return 0;
6071 for (i = 1; i < words; i++)
6072 if (classes[i] != X86_64_SSEUP_CLASS)
6073 return 0;
6076 /* Final merger cleanup. */
6077 for (i = 0; i < words; i++)
6079 /* If one class is MEMORY, everything should be passed in
6080 memory. */
6081 if (classes[i] == X86_64_MEMORY_CLASS)
6082 return 0;
6084 /* X86_64_SSEUP_CLASS should always be preceded by
6085 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6086 if (classes[i] == X86_64_SSEUP_CLASS
6087 && classes[i - 1] != X86_64_SSE_CLASS
6088 && classes[i - 1] != X86_64_SSEUP_CLASS)
6090 /* The first one should never be X86_64_SSEUP_CLASS. */
6091 gcc_assert (i != 0);
6092 classes[i] = X86_64_SSE_CLASS;
6095 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6096 everything should be passed in memory. */
6097 if (classes[i] == X86_64_X87UP_CLASS
6098 && (classes[i - 1] != X86_64_X87_CLASS))
6100 static bool warned;
6102 /* The first one should never be X86_64_X87UP_CLASS. */
6103 gcc_assert (i != 0);
6104 if (!warned && warn_psabi)
6106 warned = true;
6107 inform (input_location,
6108 "the ABI of passing union with long double"
6109 " has changed in GCC 4.4");
6111 return 0;
6114 return words;
6117 /* Compute the alignment needed. We align all types to natural boundaries
6118 with the exception of XFmode, which is aligned to 64 bits. */
6119 if (mode != VOIDmode && mode != BLKmode)
6121 int mode_alignment = GET_MODE_BITSIZE (mode);
6123 if (mode == XFmode)
6124 mode_alignment = 128;
6125 else if (mode == XCmode)
6126 mode_alignment = 256;
6127 if (COMPLEX_MODE_P (mode))
6128 mode_alignment /= 2;
6129 /* Misaligned fields are always returned in memory. */
6130 if (bit_offset % mode_alignment)
6131 return 0;
6134 /* For V1xx modes, just use the base mode. */
6135 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6136 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6137 mode = GET_MODE_INNER (mode);
6139 /* Classification of atomic types. */
6140 switch (mode)
6142 case SDmode:
6143 case DDmode:
6144 classes[0] = X86_64_SSE_CLASS;
6145 return 1;
6146 case TDmode:
6147 classes[0] = X86_64_SSE_CLASS;
6148 classes[1] = X86_64_SSEUP_CLASS;
6149 return 2;
6150 case DImode:
6151 case SImode:
6152 case HImode:
6153 case QImode:
6154 case CSImode:
6155 case CHImode:
6156 case CQImode:
6158 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6160 if (size <= 32)
6162 classes[0] = X86_64_INTEGERSI_CLASS;
6163 return 1;
6165 else if (size <= 64)
6167 classes[0] = X86_64_INTEGER_CLASS;
6168 return 1;
6170 else if (size <= 64+32)
6172 classes[0] = X86_64_INTEGER_CLASS;
6173 classes[1] = X86_64_INTEGERSI_CLASS;
6174 return 2;
6176 else if (size <= 64+64)
6178 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6179 return 2;
6181 else
6182 gcc_unreachable ();
6184 case CDImode:
6185 case TImode:
6186 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6187 return 2;
6188 case COImode:
6189 case OImode:
6190 /* OImode shouldn't be used directly. */
6191 gcc_unreachable ();
6192 case CTImode:
6193 return 0;
6194 case SFmode:
6195 if (!(bit_offset % 64))
6196 classes[0] = X86_64_SSESF_CLASS;
6197 else
6198 classes[0] = X86_64_SSE_CLASS;
6199 return 1;
6200 case DFmode:
6201 classes[0] = X86_64_SSEDF_CLASS;
6202 return 1;
6203 case XFmode:
6204 classes[0] = X86_64_X87_CLASS;
6205 classes[1] = X86_64_X87UP_CLASS;
6206 return 2;
6207 case TFmode:
6208 classes[0] = X86_64_SSE_CLASS;
6209 classes[1] = X86_64_SSEUP_CLASS;
6210 return 2;
6211 case SCmode:
6212 classes[0] = X86_64_SSE_CLASS;
6213 if (!(bit_offset % 64))
6214 return 1;
6215 else
6217 static bool warned;
6219 if (!warned && warn_psabi)
6221 warned = true;
6222 inform (input_location,
6223 "the ABI of passing structure with complex float"
6224 " member has changed in GCC 4.4");
6226 classes[1] = X86_64_SSESF_CLASS;
6227 return 2;
6229 case DCmode:
6230 classes[0] = X86_64_SSEDF_CLASS;
6231 classes[1] = X86_64_SSEDF_CLASS;
6232 return 2;
6233 case XCmode:
6234 classes[0] = X86_64_COMPLEX_X87_CLASS;
6235 return 1;
6236 case TCmode:
6237 /* This mode is larger than 16 bytes. */
6238 return 0;
6239 case V8SFmode:
6240 case V8SImode:
6241 case V32QImode:
6242 case V16HImode:
6243 case V4DFmode:
6244 case V4DImode:
6245 classes[0] = X86_64_SSE_CLASS;
6246 classes[1] = X86_64_SSEUP_CLASS;
6247 classes[2] = X86_64_SSEUP_CLASS;
6248 classes[3] = X86_64_SSEUP_CLASS;
6249 return 4;
6250 case V4SFmode:
6251 case V4SImode:
6252 case V16QImode:
6253 case V8HImode:
6254 case V2DFmode:
6255 case V2DImode:
6256 classes[0] = X86_64_SSE_CLASS;
6257 classes[1] = X86_64_SSEUP_CLASS;
6258 return 2;
6259 case V1TImode:
6260 case V1DImode:
6261 case V2SFmode:
6262 case V2SImode:
6263 case V4HImode:
6264 case V8QImode:
6265 classes[0] = X86_64_SSE_CLASS;
6266 return 1;
6267 case BLKmode:
6268 case VOIDmode:
6269 return 0;
6270 default:
6271 gcc_assert (VECTOR_MODE_P (mode));
6273 if (bytes > 16)
6274 return 0;
6276 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6278 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6279 classes[0] = X86_64_INTEGERSI_CLASS;
6280 else
6281 classes[0] = X86_64_INTEGER_CLASS;
6282 classes[1] = X86_64_INTEGER_CLASS;
6283 return 1 + (bytes > 8);
6287 /* Examine the argument and set the number of registers required in each
6288 class. Return 0 iff the parameter should be passed in memory. */
6289 static int
6290 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6291 int *int_nregs, int *sse_nregs)
6293 enum x86_64_reg_class regclass[MAX_CLASSES];
6294 int n = classify_argument (mode, type, regclass, 0);
6296 *int_nregs = 0;
6297 *sse_nregs = 0;
6298 if (!n)
6299 return 0;
6300 for (n--; n >= 0; n--)
6301 switch (regclass[n])
6303 case X86_64_INTEGER_CLASS:
6304 case X86_64_INTEGERSI_CLASS:
6305 (*int_nregs)++;
6306 break;
6307 case X86_64_SSE_CLASS:
6308 case X86_64_SSESF_CLASS:
6309 case X86_64_SSEDF_CLASS:
6310 (*sse_nregs)++;
6311 break;
6312 case X86_64_NO_CLASS:
6313 case X86_64_SSEUP_CLASS:
6314 break;
6315 case X86_64_X87_CLASS:
6316 case X86_64_X87UP_CLASS:
6317 if (!in_return)
6318 return 0;
6319 break;
6320 case X86_64_COMPLEX_X87_CLASS:
6321 return in_return ? 2 : 0;
6322 case X86_64_MEMORY_CLASS:
6323 gcc_unreachable ();
6325 return 1;
6328 /* Construct container for the argument used by GCC interface. See
6329 FUNCTION_ARG for the detailed description. */
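/* Sketch of the result (hypothetical case, RTL shown only approximately):
   for a struct { double d; long l; } passed as the first argument, this
   returns something like

       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:DI di) (const_int 8))])

   i.e. the first eightbyte in an SSE register at offset 0 and the second
   in an integer register at offset 8.  */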
6331 static rtx
6332 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6333 const_tree type, int in_return, int nintregs, int nsseregs,
6334 const int *intreg, int sse_regno)
6336 /* The following variables hold the static issued_error state. */
6337 static bool issued_sse_arg_error;
6338 static bool issued_sse_ret_error;
6339 static bool issued_x87_ret_error;
6341 enum machine_mode tmpmode;
6342 int bytes =
6343 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6344 enum x86_64_reg_class regclass[MAX_CLASSES];
6345 int n;
6346 int i;
6347 int nexps = 0;
6348 int needed_sseregs, needed_intregs;
6349 rtx exp[MAX_CLASSES];
6350 rtx ret;
6352 n = classify_argument (mode, type, regclass, 0);
6353 if (!n)
6354 return NULL;
6355 if (!examine_argument (mode, type, in_return, &needed_intregs,
6356 &needed_sseregs))
6357 return NULL;
6358 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6359 return NULL;
6361 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6362 some less clueful developer tries to use floating-point anyway. */
6363 if (needed_sseregs && !TARGET_SSE)
6365 if (in_return)
6367 if (!issued_sse_ret_error)
6369 error ("SSE register return with SSE disabled");
6370 issued_sse_ret_error = true;
6373 else if (!issued_sse_arg_error)
6375 error ("SSE register argument with SSE disabled");
6376 issued_sse_arg_error = true;
6378 return NULL;
6381 /* Likewise, error if the ABI requires us to return values in the
6382 x87 registers and the user specified -mno-80387. */
6383 if (!TARGET_80387 && in_return)
6384 for (i = 0; i < n; i++)
6385 if (regclass[i] == X86_64_X87_CLASS
6386 || regclass[i] == X86_64_X87UP_CLASS
6387 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6389 if (!issued_x87_ret_error)
6391 error ("x87 register return with x87 disabled");
6392 issued_x87_ret_error = true;
6394 return NULL;
6397 /* First construct simple cases. Avoid SCmode, since we want to use
6398 a single register to pass this type. */
6399 if (n == 1 && mode != SCmode)
6400 switch (regclass[0])
6402 case X86_64_INTEGER_CLASS:
6403 case X86_64_INTEGERSI_CLASS:
6404 return gen_rtx_REG (mode, intreg[0]);
6405 case X86_64_SSE_CLASS:
6406 case X86_64_SSESF_CLASS:
6407 case X86_64_SSEDF_CLASS:
6408 if (mode != BLKmode)
6409 return gen_reg_or_parallel (mode, orig_mode,
6410 SSE_REGNO (sse_regno));
6411 break;
6412 case X86_64_X87_CLASS:
6413 case X86_64_COMPLEX_X87_CLASS:
6414 return gen_rtx_REG (mode, FIRST_STACK_REG);
6415 case X86_64_NO_CLASS:
6416 /* Zero sized array, struct or class. */
6417 return NULL;
6418 default:
6419 gcc_unreachable ();
6421 if (n == 2
6422 && regclass[0] == X86_64_SSE_CLASS
6423 && regclass[1] == X86_64_SSEUP_CLASS
6424 && mode != BLKmode)
6425 return gen_reg_or_parallel (mode, orig_mode,
6426 SSE_REGNO (sse_regno));
6427 if (n == 4
6428 && regclass[0] == X86_64_SSE_CLASS
6429 && regclass[1] == X86_64_SSEUP_CLASS
6430 && regclass[2] == X86_64_SSEUP_CLASS
6431 && regclass[3] == X86_64_SSEUP_CLASS
6432 && mode != BLKmode)
6433 return gen_reg_or_parallel (mode, orig_mode,
6434 SSE_REGNO (sse_regno));
6435 if (n == 2
6436 && regclass[0] == X86_64_X87_CLASS
6437 && regclass[1] == X86_64_X87UP_CLASS)
6438 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6440 if (n == 2
6441 && regclass[0] == X86_64_INTEGER_CLASS
6442 && regclass[1] == X86_64_INTEGER_CLASS
6443 && (mode == CDImode || mode == TImode || mode == TFmode)
6444 && intreg[0] + 1 == intreg[1])
6445 return gen_rtx_REG (mode, intreg[0]);
6447 /* Otherwise figure out the entries of the PARALLEL. */
6448 for (i = 0; i < n; i++)
6450 int pos;
6452 switch (regclass[i])
6454 case X86_64_NO_CLASS:
6455 break;
6456 case X86_64_INTEGER_CLASS:
6457 case X86_64_INTEGERSI_CLASS:
6458 /* Merge TImodes on aligned occasions here too. */
6459 if (i * 8 + 8 > bytes)
6460 tmpmode
6461 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6462 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6463 tmpmode = SImode;
6464 else
6465 tmpmode = DImode;
6466 /* We've requested 24 bytes for which
6467 we don't have a mode. Use DImode. */
6468 if (tmpmode == BLKmode)
6469 tmpmode = DImode;
6470 exp [nexps++]
6471 = gen_rtx_EXPR_LIST (VOIDmode,
6472 gen_rtx_REG (tmpmode, *intreg),
6473 GEN_INT (i*8));
6474 intreg++;
6475 break;
6476 case X86_64_SSESF_CLASS:
6477 exp [nexps++]
6478 = gen_rtx_EXPR_LIST (VOIDmode,
6479 gen_rtx_REG (SFmode,
6480 SSE_REGNO (sse_regno)),
6481 GEN_INT (i*8));
6482 sse_regno++;
6483 break;
6484 case X86_64_SSEDF_CLASS:
6485 exp [nexps++]
6486 = gen_rtx_EXPR_LIST (VOIDmode,
6487 gen_rtx_REG (DFmode,
6488 SSE_REGNO (sse_regno)),
6489 GEN_INT (i*8));
6490 sse_regno++;
6491 break;
6492 case X86_64_SSE_CLASS:
6493 pos = i;
6494 switch (n)
6496 case 1:
6497 tmpmode = DImode;
6498 break;
6499 case 2:
6500 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6502 tmpmode = TImode;
6503 i++;
6505 else
6506 tmpmode = DImode;
6507 break;
6508 case 4:
6509 gcc_assert (i == 0
6510 && regclass[1] == X86_64_SSEUP_CLASS
6511 && regclass[2] == X86_64_SSEUP_CLASS
6512 && regclass[3] == X86_64_SSEUP_CLASS);
6513 tmpmode = OImode;
6514 i += 3;
6515 break;
6516 default:
6517 gcc_unreachable ();
6519 exp [nexps++]
6520 = gen_rtx_EXPR_LIST (VOIDmode,
6521 gen_rtx_REG (tmpmode,
6522 SSE_REGNO (sse_regno)),
6523 GEN_INT (pos*8));
6524 sse_regno++;
6525 break;
6526 default:
6527 gcc_unreachable ();
6531 /* Empty aligned struct, union or class. */
6532 if (nexps == 0)
6533 return NULL;
6535 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6536 for (i = 0; i < nexps; i++)
6537 XVECEXP (ret, 0, i) = exp [i];
6538 return ret;
6541 /* Update the data in CUM to advance over an argument of mode MODE
6542 and data type TYPE. (TYPE is null for libcalls where that information
6543 may not be available.) */
6545 static void
6546 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6547 const_tree type, HOST_WIDE_INT bytes,
6548 HOST_WIDE_INT words)
6550 switch (mode)
6552 default:
6553 break;
6555 case BLKmode:
6556 if (bytes < 0)
6557 break;
6558 /* FALLTHRU */
6560 case DImode:
6561 case SImode:
6562 case HImode:
6563 case QImode:
6564 cum->words += words;
6565 cum->nregs -= words;
6566 cum->regno += words;
6568 if (cum->nregs <= 0)
6570 cum->nregs = 0;
6571 cum->regno = 0;
6573 break;
6575 case OImode:
6576 /* OImode shouldn't be used directly. */
6577 gcc_unreachable ();
6579 case DFmode:
6580 if (cum->float_in_sse < 2)
6581 break;
6582 case SFmode:
6583 if (cum->float_in_sse < 1)
6584 break;
6585 /* FALLTHRU */
6587 case V8SFmode:
6588 case V8SImode:
6589 case V32QImode:
6590 case V16HImode:
6591 case V4DFmode:
6592 case V4DImode:
6593 case TImode:
6594 case V16QImode:
6595 case V8HImode:
6596 case V4SImode:
6597 case V2DImode:
6598 case V4SFmode:
6599 case V2DFmode:
6600 if (!type || !AGGREGATE_TYPE_P (type))
6602 cum->sse_words += words;
6603 cum->sse_nregs -= 1;
6604 cum->sse_regno += 1;
6605 if (cum->sse_nregs <= 0)
6607 cum->sse_nregs = 0;
6608 cum->sse_regno = 0;
6611 break;
6613 case V8QImode:
6614 case V4HImode:
6615 case V2SImode:
6616 case V2SFmode:
6617 case V1TImode:
6618 case V1DImode:
6619 if (!type || !AGGREGATE_TYPE_P (type))
6621 cum->mmx_words += words;
6622 cum->mmx_nregs -= 1;
6623 cum->mmx_regno += 1;
6624 if (cum->mmx_nregs <= 0)
6626 cum->mmx_nregs = 0;
6627 cum->mmx_regno = 0;
6630 break;
6634 static void
6635 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6636 const_tree type, HOST_WIDE_INT words, bool named)
6638 int int_nregs, sse_nregs;
6640 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6641 if (!named && VALID_AVX256_REG_MODE (mode))
6642 return;
6644 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6645 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6647 cum->nregs -= int_nregs;
6648 cum->sse_nregs -= sse_nregs;
6649 cum->regno += int_nregs;
6650 cum->sse_regno += sse_nregs;
6652 else
6654 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6655 cum->words = (cum->words + align - 1) & ~(align - 1);
6656 cum->words += words;
6660 static void
6661 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6662 HOST_WIDE_INT words)
6664 /* Otherwise, this should be passed indirectly. */
6665 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6667 cum->words += words;
6668 if (cum->nregs > 0)
6670 cum->nregs -= 1;
6671 cum->regno += 1;
6675 /* Update the data in CUM to advance over an argument of mode MODE and
6676 data type TYPE. (TYPE is null for libcalls where that information
6677 may not be available.) */
6679 static void
6680 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6681 const_tree type, bool named)
6683 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6684 HOST_WIDE_INT bytes, words;
6686 if (mode == BLKmode)
6687 bytes = int_size_in_bytes (type);
6688 else
6689 bytes = GET_MODE_SIZE (mode);
6690 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6692 if (type)
6693 mode = type_natural_mode (type, NULL);
6695 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6696 function_arg_advance_ms_64 (cum, bytes, words);
6697 else if (TARGET_64BIT)
6698 function_arg_advance_64 (cum, mode, type, words, named);
6699 else
6700 function_arg_advance_32 (cum, mode, type, bytes, words);
6703 /* Define where to put the arguments to a function.
6704 Value is zero to push the argument on the stack,
6705 or a hard register in which to store the argument.
6707 MODE is the argument's machine mode.
6708 TYPE is the data type of the argument (as a tree).
6709 This is null for libcalls where that information may
6710 not be available.
6711 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6712 the preceding args and about the function being called.
6713 NAMED is nonzero if this argument is a named parameter
6714 (otherwise it is an extra parameter matching an ellipsis). */
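/* Example of the 32-bit handling below (hypothetical declaration):

       int __attribute__((fastcall)) f (int a, int b, int c);

   gets A in ECX and B in EDX (note the AX_REG to CX_REG adjustment
   below), while C, aggregates and DImode values fall through to the
   stack.  */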
6716 static rtx
6717 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6718 enum machine_mode orig_mode, const_tree type,
6719 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6721 static bool warnedsse, warnedmmx;
6723 /* Avoid the AL settings for the Unix64 ABI. */
6724 if (mode == VOIDmode)
6725 return constm1_rtx;
6727 switch (mode)
6729 default:
6730 break;
6732 case BLKmode:
6733 if (bytes < 0)
6734 break;
6735 /* FALLTHRU */
6736 case DImode:
6737 case SImode:
6738 case HImode:
6739 case QImode:
6740 if (words <= cum->nregs)
6742 int regno = cum->regno;
6744 /* Fastcall allocates the first two DWORD (SImode) or
6745 smaller arguments to ECX and EDX if it isn't an
6746 aggregate type. */
6747 if (cum->fastcall)
6749 if (mode == BLKmode
6750 || mode == DImode
6751 || (type && AGGREGATE_TYPE_P (type)))
6752 break;
6754 /* ECX, not EAX, is the first allocated register. */
6755 if (regno == AX_REG)
6756 regno = CX_REG;
6758 return gen_rtx_REG (mode, regno);
6760 break;
6762 case DFmode:
6763 if (cum->float_in_sse < 2)
6764 break;
6765 case SFmode:
6766 if (cum->float_in_sse < 1)
6767 break;
6768 /* FALLTHRU */
6769 case TImode:
6770 /* In 32bit, we pass TImode in xmm registers. */
6771 case V16QImode:
6772 case V8HImode:
6773 case V4SImode:
6774 case V2DImode:
6775 case V4SFmode:
6776 case V2DFmode:
6777 if (!type || !AGGREGATE_TYPE_P (type))
6779 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6781 warnedsse = true;
6782 warning (0, "SSE vector argument without SSE enabled "
6783 "changes the ABI");
6785 if (cum->sse_nregs)
6786 return gen_reg_or_parallel (mode, orig_mode,
6787 cum->sse_regno + FIRST_SSE_REG);
6789 break;
6791 case OImode:
6792 /* OImode shouldn't be used directly. */
6793 gcc_unreachable ();
6795 case V8SFmode:
6796 case V8SImode:
6797 case V32QImode:
6798 case V16HImode:
6799 case V4DFmode:
6800 case V4DImode:
6801 if (!type || !AGGREGATE_TYPE_P (type))
6803 if (cum->sse_nregs)
6804 return gen_reg_or_parallel (mode, orig_mode,
6805 cum->sse_regno + FIRST_SSE_REG);
6807 break;
6809 case V8QImode:
6810 case V4HImode:
6811 case V2SImode:
6812 case V2SFmode:
6813 case V1TImode:
6814 case V1DImode:
6815 if (!type || !AGGREGATE_TYPE_P (type))
6817 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6819 warnedmmx = true;
6820 warning (0, "MMX vector argument without MMX enabled "
6821 "changes the ABI");
6823 if (cum->mmx_nregs)
6824 return gen_reg_or_parallel (mode, orig_mode,
6825 cum->mmx_regno + FIRST_MMX_REG);
6827 break;
6830 return NULL_RTX;
6833 static rtx
6834 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6835 enum machine_mode orig_mode, const_tree type, bool named)
6837 /* Handle a hidden AL argument containing the number of registers
6838 for varargs x86-64 functions. */
6839 if (mode == VOIDmode)
6840 return GEN_INT (cum->maybe_vaarg
6841 ? (cum->sse_nregs < 0
6842 ? X86_64_SSE_REGPARM_MAX
6843 : cum->sse_regno)
6844 : -1);
6846 switch (mode)
6848 default:
6849 break;
6851 case V8SFmode:
6852 case V8SImode:
6853 case V32QImode:
6854 case V16HImode:
6855 case V4DFmode:
6856 case V4DImode:
6857 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6858 if (!named)
6859 return NULL;
6860 break;
6863 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6864 cum->sse_nregs,
6865 &x86_64_int_parameter_registers [cum->regno],
6866 cum->sse_regno);
6869 static rtx
6870 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6871 enum machine_mode orig_mode, bool named,
6872 HOST_WIDE_INT bytes)
6874 unsigned int regno;
6876 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6877 We use the value -2 to specify that the current function call uses the MS ABI. */
6878 if (mode == VOIDmode)
6879 return GEN_INT (-2);
6881 /* If we've run out of registers, it goes on the stack. */
6882 if (cum->nregs == 0)
6883 return NULL_RTX;
6885 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6887 /* Only floating point modes are passed in anything but integer regs. */
6888 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6890 if (named)
6891 regno = cum->regno + FIRST_SSE_REG;
6892 else
6894 rtx t1, t2;
6896 /* Unnamed floating parameters are passed in both the
6897 SSE and integer registers. */
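/* The Microsoft x64 convention requires this duplication because the
   callee of a varargs call does not know whether it will read the
   value through va_arg as an integer or as a floating-point value.  */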
6898 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6899 t2 = gen_rtx_REG (mode, regno);
6900 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6901 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6902 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6905 /* Handle aggregate types passed in registers. */
6906 if (orig_mode == BLKmode)
6908 if (bytes > 0 && bytes <= 8)
6909 mode = (bytes > 4 ? DImode : SImode);
6910 if (mode == BLKmode)
6911 mode = DImode;
6914 return gen_reg_or_parallel (mode, orig_mode, regno);
6917 /* Return where to put the arguments to a function.
6918 Return zero to push the argument on the stack, or a hard register in
6919 which to store the argument.
6920 MODE is the argument's machine mode. TYPE is the data type of the
6921 argument. It is null for libcalls where that information may not be
6922 available. CUM gives information about the preceding args and about
6923 the function being called. NAMED is nonzero if this argument is a
6924 named parameter (otherwise it is an extra parameter matching an
6925 ellipsis). */
6927 static rtx
6928 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6929 const_tree type, bool named)
6931 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6932 enum machine_mode mode = omode;
6933 HOST_WIDE_INT bytes, words;
6934 rtx arg;
6936 if (mode == BLKmode)
6937 bytes = int_size_in_bytes (type);
6938 else
6939 bytes = GET_MODE_SIZE (mode);
6940 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6942 /* To simplify the code below, represent vector types with a vector mode
6943 even if MMX/SSE are not active. */
6944 if (type && TREE_CODE (type) == VECTOR_TYPE)
6945 mode = type_natural_mode (type, cum);
6947 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6948 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6949 else if (TARGET_64BIT)
6950 arg = function_arg_64 (cum, mode, omode, type, named);
6951 else
6952 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6954 return arg;
6957 /* A C expression that indicates when an argument must be passed by
6958 reference. If nonzero for an argument, a copy of that argument is
6959 made in memory and a pointer to the argument is passed instead of
6960 the argument itself. The pointer is passed in whatever way is
6961 appropriate for passing a pointer to that type. */
6963 static bool
6964 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6965 enum machine_mode mode ATTRIBUTE_UNUSED,
6966 const_tree type, bool named ATTRIBUTE_UNUSED)
6968 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6970 /* See Windows x64 Software Convention. */
6971 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6973 int msize = (int) GET_MODE_SIZE (mode);
6974 if (type)
6976 /* Arrays are passed by reference. */
6977 if (TREE_CODE (type) == ARRAY_TYPE)
6978 return true;
6980 if (AGGREGATE_TYPE_P (type))
6982 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6983 are passed by reference. */
6984 msize = int_size_in_bytes (type);
6988 /* __m128 is passed by reference. */
6989 switch (msize) {
6990 case 1: case 2: case 4: case 8:
6991 break;
6992 default:
6993 return true;
6996 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6997 return true;
6999 return false;
7002 /* Return true when TYPE should be 128bit aligned for 32bit argument
7003 passing ABI. XXX: This function is obsolete and is only used for
7004 checking psABI compatibility with previous versions of GCC. */
7006 static bool
7007 ix86_compat_aligned_value_p (const_tree type)
7009 enum machine_mode mode = TYPE_MODE (type);
7010 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7011 || mode == TDmode
7012 || mode == TFmode
7013 || mode == TCmode)
7014 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7015 return true;
7016 if (TYPE_ALIGN (type) < 128)
7017 return false;
7019 if (AGGREGATE_TYPE_P (type))
7021 /* Walk the aggregates recursively. */
7022 switch (TREE_CODE (type))
7024 case RECORD_TYPE:
7025 case UNION_TYPE:
7026 case QUAL_UNION_TYPE:
7028 tree field;
7030 /* Walk all the structure fields. */
7031 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7033 if (TREE_CODE (field) == FIELD_DECL
7034 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7035 return true;
7037 break;
7040 case ARRAY_TYPE:
7041 /* Just for use if some languages pass arrays by value. */
7042 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7043 return true;
7044 break;
7046 default:
7047 gcc_unreachable ();
7050 return false;
7053 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7054 XXX: This function is obsolete and is only used for checking psABI
7055 compatibility with previous versions of GCC. */
7057 static unsigned int
7058 ix86_compat_function_arg_boundary (enum machine_mode mode,
7059 const_tree type, unsigned int align)
7061 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7062 natural boundaries. */
7063 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7065 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7066 make an exception for SSE modes since these require 128bit
7067 alignment.
7069 The handling here differs from field_alignment. ICC aligns MMX
7070 arguments to 4 byte boundaries, while structure fields are aligned
7071 to 8 byte boundaries. */
7072 if (!type)
7074 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7075 align = PARM_BOUNDARY;
7077 else
7079 if (!ix86_compat_aligned_value_p (type))
7080 align = PARM_BOUNDARY;
7083 if (align > BIGGEST_ALIGNMENT)
7084 align = BIGGEST_ALIGNMENT;
7085 return align;
7088 /* Return true when TYPE should be 128bit aligned for 32bit argument
7089 passing ABI. */
7091 static bool
7092 ix86_contains_aligned_value_p (const_tree type)
7094 enum machine_mode mode = TYPE_MODE (type);
7096 if (mode == XFmode || mode == XCmode)
7097 return false;
7099 if (TYPE_ALIGN (type) < 128)
7100 return false;
7102 if (AGGREGATE_TYPE_P (type))
7104 /* Walk the aggregates recursively. */
7105 switch (TREE_CODE (type))
7107 case RECORD_TYPE:
7108 case UNION_TYPE:
7109 case QUAL_UNION_TYPE:
7111 tree field;
7113 /* Walk all the structure fields. */
7114 for (field = TYPE_FIELDS (type);
7115 field;
7116 field = DECL_CHAIN (field))
7118 if (TREE_CODE (field) == FIELD_DECL
7119 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7120 return true;
7122 break;
7125 case ARRAY_TYPE:
7126 /* Just for use if some languages pass arrays by value. */
7127 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7128 return true;
7129 break;
7131 default:
7132 gcc_unreachable ();
7135 else
7136 return TYPE_ALIGN (type) >= 128;
7138 return false;
7141 /* Gives the alignment boundary, in bits, of an argument with the
7142 specified mode and type. */
7144 static unsigned int
7145 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7147 unsigned int align;
7148 if (type)
7150 /* Since the main variant type is used for the call, convert the type
7151 to its main variant. */
7152 type = TYPE_MAIN_VARIANT (type);
7153 align = TYPE_ALIGN (type);
7155 else
7156 align = GET_MODE_ALIGNMENT (mode);
7157 if (align < PARM_BOUNDARY)
7158 align = PARM_BOUNDARY;
7159 else
7161 static bool warned;
7162 unsigned int saved_align = align;
7164 if (!TARGET_64BIT)
7166 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7167 if (!type)
7169 if (mode == XFmode || mode == XCmode)
7170 align = PARM_BOUNDARY;
7172 else if (!ix86_contains_aligned_value_p (type))
7173 align = PARM_BOUNDARY;
7175 if (align < 128)
7176 align = PARM_BOUNDARY;
7179 if (warn_psabi
7180 && !warned
7181 && align != ix86_compat_function_arg_boundary (mode, type,
7182 saved_align))
7184 warned = true;
7185 inform (input_location,
7186 "The ABI for passing parameters with %d-byte"
7187 " alignment has changed in GCC 4.6",
7188 align / BITS_PER_UNIT);
7192 return align;
7195 /* Return true if N is a possible register number of function value. */
7197 static bool
7198 ix86_function_value_regno_p (const unsigned int regno)
7200 switch (regno)
7202 case AX_REG:
7203 return true;
7205 case FIRST_FLOAT_REG:
7206 /* TODO: The function should depend on current function ABI but
7207 builtins.c would need updating then. Therefore we use the
7208 default ABI. */
7209 if (TARGET_64BIT && ix86_abi == MS_ABI)
7210 return false;
7211 return TARGET_FLOAT_RETURNS_IN_80387;
7213 case FIRST_SSE_REG:
7214 return TARGET_SSE;
7216 case FIRST_MMX_REG:
7217 if (TARGET_MACHO || TARGET_64BIT)
7218 return false;
7219 return TARGET_MMX;
7222 return false;
7225 /* Define how to find the value returned by a function.
7226 VALTYPE is the data type of the value (as a tree).
7227 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7228 otherwise, FUNC is 0. */
7230 static rtx
7231 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7232 const_tree fntype, const_tree fn)
7234 unsigned int regno;
7236 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7237 we normally prevent this case when mmx is not available. However
7238 some ABIs may require the result to be returned like DImode. */
7239 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7240 regno = FIRST_MMX_REG;
7242 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7243 we prevent this case when sse is not available. However some ABIs
7244 may require the result to be returned like integer TImode. */
7245 else if (mode == TImode
7246 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7247 regno = FIRST_SSE_REG;
7249 /* 32-byte vector modes in %ymm0. */
7250 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7251 regno = FIRST_SSE_REG;
7253 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7254 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7255 regno = FIRST_FLOAT_REG;
7256 else
7257 /* Most things go in %eax. */
7258 regno = AX_REG;
7260 /* Override FP return register with %xmm0 for local functions when
7261 SSE math is enabled or for functions with sseregparm attribute. */
7262 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7264 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7265 if ((sse_level >= 1 && mode == SFmode)
7266 || (sse_level == 2 && mode == DFmode))
7267 regno = FIRST_SSE_REG;
7270 /* OImode shouldn't be used directly. */
7271 gcc_assert (mode != OImode);
7273 return gen_rtx_REG (orig_mode, regno);
7276 static rtx
7277 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7278 const_tree valtype)
7280 rtx ret;
7282 /* Handle libcalls, which don't provide a type node. */
7283 if (valtype == NULL)
7285 unsigned int regno;
7287 switch (mode)
7289 case SFmode:
7290 case SCmode:
7291 case DFmode:
7292 case DCmode:
7293 case TFmode:
7294 case SDmode:
7295 case DDmode:
7296 case TDmode:
7297 regno = FIRST_SSE_REG;
7298 break;
7299 case XFmode:
7300 case XCmode:
7301 regno = FIRST_FLOAT_REG;
7302 break;
7303 case TCmode:
7304 return NULL;
7305 default:
7306 regno = AX_REG;
7309 return gen_rtx_REG (mode, regno);
7311 else if (POINTER_TYPE_P (valtype)
7312 && !upc_shared_type_p (TREE_TYPE (valtype)))
7314 /* Pointers are always returned in word_mode. */
7315 mode = word_mode;
7318 ret = construct_container (mode, orig_mode, valtype, 1,
7319 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7320 x86_64_int_return_registers, 0);
7322 /* For zero-sized structures, construct_container returns NULL, but we
7323 need to keep the rest of the compiler happy by returning a meaningful value. */
7324 if (!ret)
7325 ret = gen_rtx_REG (orig_mode, AX_REG);
7327 return ret;
7330 static rtx
7331 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7333 unsigned int regno = AX_REG;
7335 if (TARGET_SSE)
7337 switch (GET_MODE_SIZE (mode))
7339 case 16:
7340 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7341 && !COMPLEX_MODE_P (mode))
7342 regno = FIRST_SSE_REG;
7343 break;
7344 case 8:
7345 case 4:
7346 if (mode == SFmode || mode == DFmode)
7347 regno = FIRST_SSE_REG;
7348 break;
7349 default:
7350 break;
7353 return gen_rtx_REG (orig_mode, regno);
7356 static rtx
7357 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7358 enum machine_mode orig_mode, enum machine_mode mode)
7360 const_tree fn, fntype;
7362 fn = NULL_TREE;
7363 if (fntype_or_decl && DECL_P (fntype_or_decl))
7364 fn = fntype_or_decl;
7365 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7367 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7368 return function_value_ms_64 (orig_mode, mode);
7369 else if (TARGET_64BIT)
7370 return function_value_64 (orig_mode, mode, valtype);
7371 else
7372 return function_value_32 (orig_mode, mode, fntype, fn);
7375 static rtx
7376 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7377 bool outgoing ATTRIBUTE_UNUSED)
7379 enum machine_mode mode, orig_mode;
7381 orig_mode = TYPE_MODE (valtype);
7382 mode = type_natural_mode (valtype, NULL);
7383 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7386 /* Pointer function arguments and return values are promoted to
7387 word_mode. */
7389 static enum machine_mode
7390 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7391 int *punsignedp, const_tree fntype,
7392 int for_return)
7394 if (type != NULL_TREE && POINTER_TYPE_P (type))
7396 if (upc_shared_type_p (TREE_TYPE (type)))
7398 *punsignedp = 1;
7399 return TYPE_MODE (upc_pts_rep_type_node);
7401 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7402 return word_mode;
7404 return default_promote_function_mode (type, mode, punsignedp, fntype,
7405 for_return);
7408 /* Return true if a structure, union or array with MODE containing FIELD
7409 should be accessed using BLKmode. */
7411 static bool
7412 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7414 /* Union with XFmode must be in BLKmode. */
7415 return (mode == XFmode
7416 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7417 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7420 static rtx
7421 ix86_libcall_value (enum machine_mode mode)
7423 return ix86_function_value_1 (NULL, NULL, mode, mode);
7426 /* Return true iff type is returned in memory. */
7428 static bool ATTRIBUTE_UNUSED
7429 return_in_memory_32 (const_tree type, enum machine_mode mode)
7431 HOST_WIDE_INT size;
7433 if (mode == BLKmode)
7434 return true;
7436 size = int_size_in_bytes (type);
7438 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7439 return false;
7441 if (VECTOR_MODE_P (mode) || mode == TImode)
7443 /* User-created vectors small enough to fit in EAX. */
7444 if (size < 8)
7445 return false;
7447 /* MMX/3dNow values are returned in MM0,
7448 except when it doesn't exist or the ABI prescribes otherwise. */
7449 if (size == 8)
7450 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7452 /* SSE values are returned in XMM0, except when it doesn't exist. */
7453 if (size == 16)
7454 return !TARGET_SSE;
7456 /* AVX values are returned in YMM0, except when it doesn't exist. */
7457 if (size == 32)
7458 return !TARGET_AVX;
7461 if (mode == XFmode)
7462 return false;
7464 if (size > 12)
7465 return true;
7467 /* OImode shouldn't be used directly. */
7468 gcc_assert (mode != OImode);
7470 return false;
7473 static bool ATTRIBUTE_UNUSED
7474 return_in_memory_64 (const_tree type, enum machine_mode mode)
7476 int needed_intregs, needed_sseregs;
7477 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7480 static bool ATTRIBUTE_UNUSED
7481 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7483 HOST_WIDE_INT size = int_size_in_bytes (type);
7485 /* __m128 is returned in xmm0. */
7486 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7487 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7488 return false;
7490 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7491 return size != 1 && size != 2 && size != 4 && size != 8;
7494 static bool
7495 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7497 #ifdef SUBTARGET_RETURN_IN_MEMORY
7498 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7499 #else
7500 const enum machine_mode mode = type_natural_mode (type, NULL);
7502 if (TARGET_64BIT)
7504 if (ix86_function_type_abi (fntype) == MS_ABI)
7505 return return_in_memory_ms_64 (type, mode);
7506 else
7507 return return_in_memory_64 (type, mode);
7509 else
7510 return return_in_memory_32 (type, mode);
7511 #endif
7514 /* When returning SSE vector types, we have a choice of either
7515 (1) being abi incompatible with a -march switch, or
7516 (2) generating an error.
7517 Given no good solution, I think the safest thing is one warning.
7518 The user won't be able to use -Werror, but....
7520 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7521 called in response to actually generating a caller or callee that
7522 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7523 via aggregate_value_p for general type probing from tree-ssa. */
7525 static rtx
7526 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7528 static bool warnedsse, warnedmmx;
7530 if (!TARGET_64BIT && type)
7532 /* Look at the return type of the function, not the function type. */
7533 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7535 if (!TARGET_SSE && !warnedsse)
7537 if (mode == TImode
7538 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7540 warnedsse = true;
7541 warning (0, "SSE vector return without SSE enabled "
7542 "changes the ABI");
7546 if (!TARGET_MMX && !warnedmmx)
7548 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7550 warnedmmx = true;
7551 warning (0, "MMX vector return without MMX enabled "
7552 "changes the ABI");
7557 return NULL;
7561 /* Create the va_list data type. */
7563 /* Returns the calling convention specific va_list data type.
7564 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
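/* For the SysV x86-64 ABI, the record built below corresponds to

     typedef struct {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];

   while for i386 and the MS x64 ABI, va_list is just a char pointer.  */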
7566 static tree
7567 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7569 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7571 /* For i386 we use plain pointer to argument area. */
7572 if (!TARGET_64BIT || abi == MS_ABI)
7573 return build_pointer_type (char_type_node);
7575 record = lang_hooks.types.make_type (RECORD_TYPE);
7576 type_decl = build_decl (BUILTINS_LOCATION,
7577 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7579 f_gpr = build_decl (BUILTINS_LOCATION,
7580 FIELD_DECL, get_identifier ("gp_offset"),
7581 unsigned_type_node);
7582 f_fpr = build_decl (BUILTINS_LOCATION,
7583 FIELD_DECL, get_identifier ("fp_offset"),
7584 unsigned_type_node);
7585 f_ovf = build_decl (BUILTINS_LOCATION,
7586 FIELD_DECL, get_identifier ("overflow_arg_area"),
7587 ptr_type_node);
7588 f_sav = build_decl (BUILTINS_LOCATION,
7589 FIELD_DECL, get_identifier ("reg_save_area"),
7590 ptr_type_node);
7592 va_list_gpr_counter_field = f_gpr;
7593 va_list_fpr_counter_field = f_fpr;
7595 DECL_FIELD_CONTEXT (f_gpr) = record;
7596 DECL_FIELD_CONTEXT (f_fpr) = record;
7597 DECL_FIELD_CONTEXT (f_ovf) = record;
7598 DECL_FIELD_CONTEXT (f_sav) = record;
7600 TYPE_STUB_DECL (record) = type_decl;
7601 TYPE_NAME (record) = type_decl;
7602 TYPE_FIELDS (record) = f_gpr;
7603 DECL_CHAIN (f_gpr) = f_fpr;
7604 DECL_CHAIN (f_fpr) = f_ovf;
7605 DECL_CHAIN (f_ovf) = f_sav;
7607 layout_type (record);
7609 /* The correct type is an array type of one element. */
7610 return build_array_type (record, build_index_type (size_zero_node));
7613 /* Setup the builtin va_list data type and for 64-bit the additional
7614 calling convention specific va_list data types. */
7616 static tree
7617 ix86_build_builtin_va_list (void)
7619 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7621 /* Initialize abi specific va_list builtin types. */
7622 if (TARGET_64BIT)
7624 tree t;
7625 if (ix86_abi == MS_ABI)
7627 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7628 if (TREE_CODE (t) != RECORD_TYPE)
7629 t = build_variant_type_copy (t);
7630 sysv_va_list_type_node = t;
7632 else
7634 t = ret;
7635 if (TREE_CODE (t) != RECORD_TYPE)
7636 t = build_variant_type_copy (t);
7637 sysv_va_list_type_node = t;
7639 if (ix86_abi != MS_ABI)
7641 t = ix86_build_builtin_va_list_abi (MS_ABI);
7642 if (TREE_CODE (t) != RECORD_TYPE)
7643 t = build_variant_type_copy (t);
7644 ms_va_list_type_node = t;
7646 else
7648 t = ret;
7649 if (TREE_CODE (t) != RECORD_TYPE)
7650 t = build_variant_type_copy (t);
7651 ms_va_list_type_node = t;
7655 return ret;
7658 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7660 static void
7661 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7663 rtx save_area, mem;
7664 alias_set_type set;
7665 int i, max;
7667 /* GPR size of varargs save area. */
7668 if (cfun->va_list_gpr_size)
7669 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7670 else
7671 ix86_varargs_gpr_size = 0;
7673 /* FPR size of varargs save area. We don't need it if we don't pass
7674 anything in SSE registers. */
7675 if (TARGET_SSE && cfun->va_list_fpr_size)
7676 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7677 else
7678 ix86_varargs_fpr_size = 0;
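/* With the SysV x86-64 values X86_64_REGPARM_MAX == 6 and
   X86_64_SSE_REGPARM_MAX == 8, the full save area laid out below is
   6 * 8 + 8 * 16 = 176 bytes: the six GP argument registers first,
   followed by the eight SSE argument registers, 16 bytes each.  */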
7680 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7681 return;
7683 save_area = frame_pointer_rtx;
7684 set = get_varargs_alias_set ();
7686 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7687 if (max > X86_64_REGPARM_MAX)
7688 max = X86_64_REGPARM_MAX;
7690 for (i = cum->regno; i < max; i++)
7692 mem = gen_rtx_MEM (word_mode,
7693 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7694 MEM_NOTRAP_P (mem) = 1;
7695 set_mem_alias_set (mem, set);
7696 emit_move_insn (mem,
7697 gen_rtx_REG (word_mode,
7698 x86_64_int_parameter_registers[i]));
7701 if (ix86_varargs_fpr_size)
7703 enum machine_mode smode;
7704 rtx label, test;
7706 /* Now emit code to save SSE registers. The AX parameter contains number
7707 of SSE parameter registers used to call this function, though all we
7708 actually check here is the zero/non-zero status. */
7710 label = gen_label_rtx ();
7711 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7712 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7713 label));
7715 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7716 we used movdqa (i.e. TImode) instead? Perhaps even better would
7717 be if we could determine the real mode of the data, via a hook
7718 into pass_stdarg. Ignore all that for now. */
7719 smode = V4SFmode;
7720 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7721 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7723 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7724 if (max > X86_64_SSE_REGPARM_MAX)
7725 max = X86_64_SSE_REGPARM_MAX;
7727 for (i = cum->sse_regno; i < max; ++i)
7729 mem = plus_constant (Pmode, save_area,
7730 i * 16 + ix86_varargs_gpr_size);
7731 mem = gen_rtx_MEM (smode, mem);
7732 MEM_NOTRAP_P (mem) = 1;
7733 set_mem_alias_set (mem, set);
7734 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7736 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7739 emit_label (label);
7743 static void
7744 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7746 alias_set_type set = get_varargs_alias_set ();
7747 int i;
7749 /* Reset to zero, as there might have been a sysv va_arg used
7750 before. */
7751 ix86_varargs_gpr_size = 0;
7752 ix86_varargs_fpr_size = 0;
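/* The MS x64 ABI reserves a 32-byte "home" area for the four register
   parameters directly above the return address.  Spilling the remaining
   register arguments into their home slots makes all varargs visible
   as one contiguous array on the stack.  */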
7754 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7756 rtx reg, mem;
7758 mem = gen_rtx_MEM (Pmode,
7759 plus_constant (Pmode, virtual_incoming_args_rtx,
7760 i * UNITS_PER_WORD));
7761 MEM_NOTRAP_P (mem) = 1;
7762 set_mem_alias_set (mem, set);
7764 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7765 emit_move_insn (mem, reg);
7769 static void
7770 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7771 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7772 int no_rtl)
7774 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7775 CUMULATIVE_ARGS next_cum;
7776 tree fntype;
7778 /* This argument doesn't appear to be used anymore, which is good,
7779 because the old code here didn't suppress rtl generation. */
7780 gcc_assert (!no_rtl);
7782 if (!TARGET_64BIT)
7783 return;
7785 fntype = TREE_TYPE (current_function_decl);
7787 /* For varargs, we do not want to skip the dummy va_dcl argument.
7788 For stdargs, we do want to skip the last named argument. */
7789 next_cum = *cum;
7790 if (stdarg_p (fntype))
7791 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7792 true);
7794 if (cum->call_abi == MS_ABI)
7795 setup_incoming_varargs_ms_64 (&next_cum);
7796 else
7797 setup_incoming_varargs_64 (&next_cum);
7802 /* Check whether TYPE is a va_list represented as a plain char pointer. */
7802 static bool
7803 is_va_list_char_pointer (tree type)
7805 tree canonic;
7807 /* For 32-bit it is always true. */
7808 if (!TARGET_64BIT)
7809 return true;
7810 canonic = ix86_canonical_va_list_type (type);
7811 return (canonic == ms_va_list_type_node
7812 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7815 /* Implement va_start. */
7817 static void
7818 ix86_va_start (tree valist, rtx nextarg)
7820 HOST_WIDE_INT words, n_gpr, n_fpr;
7821 tree f_gpr, f_fpr, f_ovf, f_sav;
7822 tree gpr, fpr, ovf, sav, t;
7823 tree type;
7824 rtx ovf_rtx;
7826 if (flag_split_stack
7827 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7829 unsigned int scratch_regno;
7831 /* When we are splitting the stack, we can't refer to the stack
7832 arguments using internal_arg_pointer, because they may be on
7833 the old stack. The split stack prologue will arrange to
7834 leave a pointer to the old stack arguments in a scratch
7835 register, which we here copy to a pseudo-register. The split
7836 stack prologue can't set the pseudo-register directly because
7837 it (the prologue) runs before any registers have been saved. */
7839 scratch_regno = split_stack_prologue_scratch_regno ();
7840 if (scratch_regno != INVALID_REGNUM)
7842 rtx reg, seq;
7844 reg = gen_reg_rtx (Pmode);
7845 cfun->machine->split_stack_varargs_pointer = reg;
7847 start_sequence ();
7848 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7849 seq = get_insns ();
7850 end_sequence ();
7852 push_topmost_sequence ();
7853 emit_insn_after (seq, entry_of_function ());
7854 pop_topmost_sequence ();
7858 /* Only 64bit target needs something special. */
7859 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7861 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7862 std_expand_builtin_va_start (valist, nextarg);
7863 else
7865 rtx va_r, next;
7867 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7868 next = expand_binop (ptr_mode, add_optab,
7869 cfun->machine->split_stack_varargs_pointer,
7870 crtl->args.arg_offset_rtx,
7871 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7872 convert_move (va_r, next, 0);
7874 return;
7877 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7878 f_fpr = DECL_CHAIN (f_gpr);
7879 f_ovf = DECL_CHAIN (f_fpr);
7880 f_sav = DECL_CHAIN (f_ovf);
7882 valist = build_simple_mem_ref (valist);
7883 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7884 /* The following should be folded into the MEM_REF offset. */
7885 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7886 f_gpr, NULL_TREE);
7887 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7888 f_fpr, NULL_TREE);
7889 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7890 f_ovf, NULL_TREE);
7891 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7892 f_sav, NULL_TREE);
7894 /* Count number of gp and fp argument registers used. */
7895 words = crtl->args.info.words;
7896 n_gpr = crtl->args.info.regno;
7897 n_fpr = crtl->args.info.sse_regno;
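/* gp_offset and fp_offset are byte offsets into reg_save_area:
   the GP registers occupy bytes 0..47 (8 bytes each) and the SSE
   registers bytes 48..175 (16 bytes each), which matches the values
   stored below.  */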
7899 if (cfun->va_list_gpr_size)
7901 type = TREE_TYPE (gpr);
7902 t = build2 (MODIFY_EXPR, type,
7903 gpr, build_int_cst (type, n_gpr * 8));
7904 TREE_SIDE_EFFECTS (t) = 1;
7905 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7908 if (TARGET_SSE && cfun->va_list_fpr_size)
7910 type = TREE_TYPE (fpr);
7911 t = build2 (MODIFY_EXPR, type, fpr,
7912 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7913 TREE_SIDE_EFFECTS (t) = 1;
7914 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7917 /* Find the overflow area. */
7918 type = TREE_TYPE (ovf);
7919 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7920 ovf_rtx = crtl->args.internal_arg_pointer;
7921 else
7922 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7923 t = make_tree (type, ovf_rtx);
7924 if (words != 0)
7925 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7926 t = build2 (MODIFY_EXPR, type, ovf, t);
7927 TREE_SIDE_EFFECTS (t) = 1;
7928 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7930 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7932 /* Find the register save area.
7933 The function prologue saves it right above the stack frame. */
7934 type = TREE_TYPE (sav);
7935 t = make_tree (type, frame_pointer_rtx);
7936 if (!ix86_varargs_gpr_size)
7937 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7938 t = build2 (MODIFY_EXPR, type, sav, t);
7939 TREE_SIDE_EFFECTS (t) = 1;
7940 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7944 /* Implement va_arg. */
7946 static tree
7947 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7948 gimple_seq *post_p)
7950 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7951 tree f_gpr, f_fpr, f_ovf, f_sav;
7952 tree gpr, fpr, ovf, sav, t;
7953 int size, rsize;
7954 tree lab_false, lab_over = NULL_TREE;
7955 tree addr, t2;
7956 rtx container;
7957 int indirect_p = 0;
7958 tree ptrtype;
7959 enum machine_mode nat_mode;
7960 unsigned int arg_boundary;
7962 /* Only 64bit target needs something special. */
7963 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7964 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7966 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7967 f_fpr = DECL_CHAIN (f_gpr);
7968 f_ovf = DECL_CHAIN (f_fpr);
7969 f_sav = DECL_CHAIN (f_ovf);
7971 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7972 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7973 valist = build_va_arg_indirect_ref (valist);
7974 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7975 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7976 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7978 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7979 if (indirect_p)
7980 type = build_pointer_type (type);
7981 size = int_size_in_bytes (type);
7982 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7984 nat_mode = type_natural_mode (type, NULL);
7985 switch (nat_mode)
7987 case V8SFmode:
7988 case V8SImode:
7989 case V32QImode:
7990 case V16HImode:
7991 case V4DFmode:
7992 case V4DImode:
7993 /* Unnamed 256bit vector mode parameters are passed on stack. */
7994 if (!TARGET_64BIT_MS_ABI)
7996 container = NULL;
7997 break;
8000 default:
8001 container = construct_container (nat_mode, TYPE_MODE (type),
8002 type, 0, X86_64_REGPARM_MAX,
8003 X86_64_SSE_REGPARM_MAX, intreg,
8005 break;
8008 /* Pull the value out of the saved registers. */
8010 addr = create_tmp_var (ptr_type_node, "addr");
8012 if (container)
8014 int needed_intregs, needed_sseregs;
8015 bool need_temp;
8016 tree int_addr, sse_addr;
8018 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8019 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8021 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8023 need_temp = (!REG_P (container)
8024 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8025 || TYPE_ALIGN (type) > 128));
8027 /* In case we are passing a structure, verify that it is a consecutive block
8028 in the register save area. If not, we need to do moves. */
8029 if (!need_temp && !REG_P (container))
8031 /* Verify that all registers are strictly consecutive */
8032 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8034 int i;
8036 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8038 rtx slot = XVECEXP (container, 0, i);
8039 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8040 || INTVAL (XEXP (slot, 1)) != i * 16)
8041 need_temp = 1;
8044 else
8046 int i;
8048 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8050 rtx slot = XVECEXP (container, 0, i);
8051 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8052 || INTVAL (XEXP (slot, 1)) != i * 8)
8053 need_temp = 1;
8057 if (!need_temp)
8059 int_addr = addr;
8060 sse_addr = addr;
8062 else
8064 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8065 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8068 /* First ensure that we fit completely in registers. */
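/* For example, with needed_intregs == 2 the test below branches to
   lab_false once gp_offset >= (6 - 2 + 1) * 8 == 40, i.e. when fewer
   than two of the six GP register slots remain unconsumed.  */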
8069 if (needed_intregs)
8071 t = build_int_cst (TREE_TYPE (gpr),
8072 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8073 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8074 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8075 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8076 gimplify_and_add (t, pre_p);
8078 if (needed_sseregs)
8080 t = build_int_cst (TREE_TYPE (fpr),
8081 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8082 + X86_64_REGPARM_MAX * 8);
8083 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8084 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8085 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8086 gimplify_and_add (t, pre_p);
8089 /* Compute index to start of area used for integer regs. */
8090 if (needed_intregs)
8092 /* int_addr = gpr + sav; */
8093 t = fold_build_pointer_plus (sav, gpr);
8094 gimplify_assign (int_addr, t, pre_p);
8096 if (needed_sseregs)
8098 /* sse_addr = fpr + sav; */
8099 t = fold_build_pointer_plus (sav, fpr);
8100 gimplify_assign (sse_addr, t, pre_p);
8102 if (need_temp)
8104 int i, prev_size = 0;
8105 tree temp = create_tmp_var (type, "va_arg_tmp");
8107 /* addr = &temp; */
8108 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8109 gimplify_assign (addr, t, pre_p);
8111 for (i = 0; i < XVECLEN (container, 0); i++)
8113 rtx slot = XVECEXP (container, 0, i);
8114 rtx reg = XEXP (slot, 0);
8115 enum machine_mode mode = GET_MODE (reg);
8116 tree piece_type;
8117 tree addr_type;
8118 tree daddr_type;
8119 tree src_addr, src;
8120 int src_offset;
8121 tree dest_addr, dest;
8122 int cur_size = GET_MODE_SIZE (mode);
8124 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8125 prev_size = INTVAL (XEXP (slot, 1));
8126 if (prev_size + cur_size > size)
8128 cur_size = size - prev_size;
8129 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8130 if (mode == BLKmode)
8131 mode = QImode;
8133 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8134 if (mode == GET_MODE (reg))
8135 addr_type = build_pointer_type (piece_type);
8136 else
8137 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8138 true);
8139 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8140 true);
8142 if (SSE_REGNO_P (REGNO (reg)))
8144 src_addr = sse_addr;
8145 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8147 else
8149 src_addr = int_addr;
8150 src_offset = REGNO (reg) * 8;
8152 src_addr = fold_convert (addr_type, src_addr);
8153 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8155 dest_addr = fold_convert (daddr_type, addr);
8156 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8157 if (cur_size == GET_MODE_SIZE (mode))
8159 src = build_va_arg_indirect_ref (src_addr);
8160 dest = build_va_arg_indirect_ref (dest_addr);
8162 gimplify_assign (dest, src, pre_p);
8164 else
8166 tree copy
8167 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8168 3, dest_addr, src_addr,
8169 size_int (cur_size));
8170 gimplify_and_add (copy, pre_p);
8172 prev_size += cur_size;
8176 if (needed_intregs)
8178 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8179 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8180 gimplify_assign (gpr, t, pre_p);
8183 if (needed_sseregs)
8185 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8186 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8187 gimplify_assign (fpr, t, pre_p);
8190 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8192 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8195 /* ... otherwise out of the overflow area. */
8197 /* When we align a parameter on the stack for the caller, if its
8198 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8199 aligned to MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee
8200 here with the caller. */
8201 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8202 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8203 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8205 /* Care for on-stack alignment if needed. */
8206 if (arg_boundary <= 64 || size == 0)
8207 t = ovf;
8208 else
8210 HOST_WIDE_INT align = arg_boundary / 8;
8211 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8212 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8213 build_int_cst (TREE_TYPE (t), -align));
8216 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8217 gimplify_assign (addr, t, pre_p);
8219 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8220 gimplify_assign (unshare_expr (ovf), t, pre_p);
8222 if (container)
8223 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8225 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8226 addr = fold_convert (ptrtype, addr);
8228 if (indirect_p)
8229 addr = build_va_arg_indirect_ref (addr);
8230 return build_va_arg_indirect_ref (addr);
8233 /* Return true if OPNUM's MEM should be matched
8234 in movabs* patterns. */
8236 bool
8237 ix86_check_movabs (rtx insn, int opnum)
8239 rtx set, mem;
8241 set = PATTERN (insn);
8242 if (GET_CODE (set) == PARALLEL)
8243 set = XVECEXP (set, 0, 0);
8244 gcc_assert (GET_CODE (set) == SET);
8245 mem = XEXP (set, opnum);
8246 while (GET_CODE (mem) == SUBREG)
8247 mem = SUBREG_REG (mem);
8248 gcc_assert (MEM_P (mem));
8249 return volatile_ok || !MEM_VOLATILE_P (mem);
8252 /* Initialize the table of extra 80387 mathematical constants. */
8254 static void
8255 init_ext_80387_constants (void)
8257 static const char * cst[5] =
8259 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8260 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8261 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8262 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8263 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8265 int i;
8267 for (i = 0; i < 5; i++)
8269 real_from_string (&ext_80387_constants_table[i], cst[i]);
8270 /* Ensure each constant is rounded to XFmode precision. */
8271 real_convert (&ext_80387_constants_table[i],
8272 XFmode, &ext_80387_constants_table[i]);
8275 ext_80387_constants_init = 1;
8278 /* Return non-zero if the constant is something that
8279 can be loaded with a special instruction. */
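/* The return value encodes which instruction to use: -1 means not an
   80387 float constant at all, 0 not a special constant, 1 fldz,
   2 fld1, 3..7 the fldlg2/fldln2/fldl2e/fldl2t/fldpi constants above,
   and 8/9 the -0.0 and -1.0 cases that are split into fldz/fld1
   followed by fchs (see standard_80387_constant_opcode).  */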
8281 int
8282 standard_80387_constant_p (rtx x)
8284 enum machine_mode mode = GET_MODE (x);
8286 REAL_VALUE_TYPE r;
8288 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8289 return -1;
8291 if (x == CONST0_RTX (mode))
8292 return 1;
8293 if (x == CONST1_RTX (mode))
8294 return 2;
8296 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8298 /* For XFmode constants, try to find a special 80387 instruction when
8299 optimizing for size or on those CPUs that benefit from them. */
8300 if (mode == XFmode
8301 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8303 int i;
8305 if (! ext_80387_constants_init)
8306 init_ext_80387_constants ();
8308 for (i = 0; i < 5; i++)
8309 if (real_identical (&r, &ext_80387_constants_table[i]))
8310 return i + 3;
8313 /* Load of the constant -0.0 or -1.0 will be split as
8314 fldz;fchs or fld1;fchs sequence. */
8315 if (real_isnegzero (&r))
8316 return 8;
8317 if (real_identical (&r, &dconstm1))
8318 return 9;
8320 return 0;
8323 /* Return the opcode of the special instruction to be used to load
8324 the constant X. */
8326 const char *
8327 standard_80387_constant_opcode (rtx x)
8329 switch (standard_80387_constant_p (x))
8331 case 1:
8332 return "fldz";
8333 case 2:
8334 return "fld1";
8335 case 3:
8336 return "fldlg2";
8337 case 4:
8338 return "fldln2";
8339 case 5:
8340 return "fldl2e";
8341 case 6:
8342 return "fldl2t";
8343 case 7:
8344 return "fldpi";
8345 case 8:
8346 case 9:
8347 return "#";
8348 default:
8349 gcc_unreachable ();
8353 /* Return the CONST_DOUBLE representing the 80387 constant that is
8354 loaded by the specified special instruction. The argument IDX
8355 matches the return value from standard_80387_constant_p. */
8357 rtx
8358 standard_80387_constant_rtx (int idx)
8360 int i;
8362 if (! ext_80387_constants_init)
8363 init_ext_80387_constants ();
8365 switch (idx)
8367 case 3:
8368 case 4:
8369 case 5:
8370 case 6:
8371 case 7:
8372 i = idx - 3;
8373 break;
8375 default:
8376 gcc_unreachable ();
8379 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8380 XFmode);
8383 /* Return 1 if X is all 0s and 2 if X is all 1s
8384 in a supported SSE/AVX vector mode. */
8386 int
8387 standard_sse_constant_p (rtx x)
8389 enum machine_mode mode = GET_MODE (x);
8391 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8392 return 1;
8393 if (vector_all_ones_operand (x, mode))
8394 switch (mode)
8396 case V16QImode:
8397 case V8HImode:
8398 case V4SImode:
8399 case V2DImode:
8400 if (TARGET_SSE2)
8401 return 2;
8402 case V32QImode:
8403 case V16HImode:
8404 case V8SImode:
8405 case V4DImode:
8406 if (TARGET_AVX2)
8407 return 2;
8408 default:
8409 break;
8412 return 0;
8415 /* Return the opcode of the special instruction to be used to load
8416 the constant X. */
8418 const char *
8419 standard_sse_constant_opcode (rtx insn, rtx x)
8421 switch (standard_sse_constant_p (x))
8423 case 1:
8424 switch (get_attr_mode (insn))
8426 case MODE_TI:
8427 return "%vpxor\t%0, %d0";
8428 case MODE_V2DF:
8429 return "%vxorpd\t%0, %d0";
8430 case MODE_V4SF:
8431 return "%vxorps\t%0, %d0";
8433 case MODE_OI:
8434 return "vpxor\t%x0, %x0, %x0";
8435 case MODE_V4DF:
8436 return "vxorpd\t%x0, %x0, %x0";
8437 case MODE_V8SF:
8438 return "vxorps\t%x0, %x0, %x0";
8440 default:
8441 break;
8444 case 2:
8445 if (TARGET_AVX)
8446 return "vpcmpeqd\t%0, %0, %0";
8447 else
8448 return "pcmpeqd\t%0, %0";
8450 default:
8451 break;
8453 gcc_unreachable ();
8456 /* Returns true if OP contains a symbol reference */
8458 bool
8459 symbolic_reference_mentioned_p (rtx op)
8461 const char *fmt;
8462 int i;
8464 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8465 return true;
8467 fmt = GET_RTX_FORMAT (GET_CODE (op));
8468 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8470 if (fmt[i] == 'E')
8472 int j;
8474 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8475 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8476 return true;
8479 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8480 return true;
8483 return false;
8486 /* Return true if it is appropriate to emit `ret' instructions in the
8487 body of a function. Do this only if the epilogue is simple, needing a
8488 couple of insns. Prior to reloading, we can't tell how many registers
8489 must be saved, so return false then. Return false if there is no frame
8490 marker to de-allocate. */
8492 bool
8493 ix86_can_use_return_insn_p (void)
8495 struct ix86_frame frame;
8497 if (! reload_completed || frame_pointer_needed)
8498 return 0;
8500 /* Don't allow more than 32k pop, since that's all we can do
8501 with one instruction. */
8502 if (crtl->args.pops_args && crtl->args.size >= 32768)
8503 return 0;
8505 ix86_compute_frame_layout (&frame);
8506 return (frame.stack_pointer_offset == UNITS_PER_WORD
8507 && (frame.nregs + frame.nsseregs) == 0);
8510 /* Value should be nonzero if functions must have frame pointers.
8511 Zero means the frame pointer need not be set up (and parms may
8512 be accessed via the stack pointer) in functions that seem suitable. */
8514 static bool
8515 ix86_frame_pointer_required (void)
8517 /* If we accessed previous frames, then the generated code expects
8518 to be able to access the saved ebp value in our frame. */
8519 if (cfun->machine->accesses_prev_frame)
8520 return true;
8522 /* Several x86 OSes need a frame pointer for other reasons,
8523 usually pertaining to setjmp. */
8524 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8525 return true;
8527 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8528 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8529 return true;
8531 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
8532 stack allocation is 4GB. */
8533 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8534 return true;
8536 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8537 turns off the frame pointer by default. Turn it back on now if
8538 we've not got a leaf function. */
8539 if (TARGET_OMIT_LEAF_FRAME_POINTER
8540 && (!crtl->is_leaf
8541 || ix86_current_function_calls_tls_descriptor))
8542 return true;
8544 if (crtl->profile && !flag_fentry)
8545 return true;
8547 return false;
8550 /* Record that the current function accesses previous call frames. */
8552 void
8553 ix86_setup_frame_addresses (void)
8555 cfun->machine->accesses_prev_frame = 1;
8558 #ifndef USE_HIDDEN_LINKONCE
8559 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8560 # define USE_HIDDEN_LINKONCE 1
8561 # else
8562 # define USE_HIDDEN_LINKONCE 0
8563 # endif
8564 #endif
8566 static int pic_labels_used;
8568 /* Fills in the label name that should be used for a pc thunk for
8569 the given register. */
8571 static void
8572 get_pc_thunk_name (char name[32], unsigned int regno)
8574 gcc_assert (!TARGET_64BIT);
8576 if (USE_HIDDEN_LINKONCE)
8577 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8578 else
8579 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8583 /* This function generates the pc thunks used for -fpic; each thunk loads
8584 its register with the return address of the caller and then returns. */
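/* Each emitted thunk has the form

     __x86.get_pc_thunk.REG:
       mov (%esp), %REG
       ret

   i.e. it copies its own return address -- the address of the
   instruction following the call -- into REG, which the caller then
   uses as the PIC base.  */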
8586 static void
8587 ix86_code_end (void)
8589 rtx xops[2];
8590 int regno;
8592 for (regno = AX_REG; regno <= SP_REG; regno++)
8594 char name[32];
8595 tree decl;
8597 if (!(pic_labels_used & (1 << regno)))
8598 continue;
8600 get_pc_thunk_name (name, regno);
8602 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8603 get_identifier (name),
8604 build_function_type_list (void_type_node, NULL_TREE));
8605 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8606 NULL_TREE, void_type_node);
8607 TREE_PUBLIC (decl) = 1;
8608 TREE_STATIC (decl) = 1;
8609 DECL_IGNORED_P (decl) = 1;
8611 #if TARGET_MACHO
8612 if (TARGET_MACHO)
8614 switch_to_section (darwin_sections[text_coal_section]);
8615 fputs ("\t.weak_definition\t", asm_out_file);
8616 assemble_name (asm_out_file, name);
8617 fputs ("\n\t.private_extern\t", asm_out_file);
8618 assemble_name (asm_out_file, name);
8619 putc ('\n', asm_out_file);
8620 ASM_OUTPUT_LABEL (asm_out_file, name);
8621 DECL_WEAK (decl) = 1;
8623 else
8624 #endif
8625 if (USE_HIDDEN_LINKONCE)
8627 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8629 targetm.asm_out.unique_section (decl, 0);
8630 switch_to_section (get_named_section (decl, NULL, 0));
8632 targetm.asm_out.globalize_label (asm_out_file, name);
8633 fputs ("\t.hidden\t", asm_out_file);
8634 assemble_name (asm_out_file, name);
8635 putc ('\n', asm_out_file);
8636 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8638 else
8640 switch_to_section (text_section);
8641 ASM_OUTPUT_LABEL (asm_out_file, name);
8644 DECL_INITIAL (decl) = make_node (BLOCK);
8645 current_function_decl = decl;
8646 init_function_start (decl);
8647 first_function_block_is_cold = false;
8648 /* Make sure unwind info is emitted for the thunk if needed. */
8649 final_start_function (emit_barrier (), asm_out_file, 1);
8651 /* Pad stack IP move with 4 instructions (two NOPs count
8652 as one instruction). */
8653 if (TARGET_PAD_SHORT_FUNCTION)
8655 int i = 8;
8657 while (i--)
8658 fputs ("\tnop\n", asm_out_file);
8661 xops[0] = gen_rtx_REG (Pmode, regno);
8662 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8663 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8664 fputs ("\tret\n", asm_out_file);
8665 final_end_function ();
8666 init_insn_lengths ();
8667 free_after_compilation (cfun);
8668 set_cfun (NULL);
8669 current_function_decl = NULL;
8672 if (flag_split_stack)
8673 file_end_indicate_split_stack ();
8676 /* Emit code for the SET_GOT patterns. */
8678 const char *
8679 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8681 rtx xops[3];
8683 xops[0] = dest;
8685 if (TARGET_VXWORKS_RTP && flag_pic)
8687 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8688 xops[2] = gen_rtx_MEM (Pmode,
8689 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8690 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8692 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8693 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8694 an unadorned address. */
8695 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8696 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8697 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8698 return "";
8701 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8703 if (!flag_pic)
8705 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8707 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8709 #if TARGET_MACHO
8710 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8711 is what will be referenced by the Mach-O PIC subsystem. */
8712 if (!label)
8713 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8714 #endif
8716 targetm.asm_out.internal_label (asm_out_file, "L",
8717 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8719 else
8721 char name[32];
8722 get_pc_thunk_name (name, REGNO (dest));
8723 pic_labels_used |= 1 << REGNO (dest);
8725 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8726 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8727 output_asm_insn ("call\t%X2", xops);
8728 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8729 is what will be referenced by the Mach-O PIC subsystem. */
8730 #if TARGET_MACHO
8731 if (!label)
8732 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8733 else
8734 targetm.asm_out.internal_label (asm_out_file, "L",
8735 CODE_LABEL_NUMBER (label));
8736 #endif
8739 if (!TARGET_MACHO)
8740 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8742 return "";
8745 /* Generate a "push" pattern for input ARG. */
8747 static rtx
8748 gen_push (rtx arg)
8750 struct machine_function *m = cfun->machine;
8752 if (m->fs.cfa_reg == stack_pointer_rtx)
8753 m->fs.cfa_offset += UNITS_PER_WORD;
8754 m->fs.sp_offset += UNITS_PER_WORD;
8756 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8757 arg = gen_rtx_REG (word_mode, REGNO (arg));
8759 return gen_rtx_SET (VOIDmode,
8760 gen_rtx_MEM (word_mode,
8761 gen_rtx_PRE_DEC (Pmode,
8762 stack_pointer_rtx)),
8763 arg);
8766 /* Generate a "pop" pattern for input ARG. */
8768 static rtx
8769 gen_pop (rtx arg)
8771 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8772 arg = gen_rtx_REG (word_mode, REGNO (arg));
8774 return gen_rtx_SET (VOIDmode,
8775 arg,
8776 gen_rtx_MEM (word_mode,
8777 gen_rtx_POST_INC (Pmode,
8778 stack_pointer_rtx)));
8781 /* Return the number of an unused call-clobbered register that is available
8782 for the entire function, or INVALID_REGNUM if there is none. */
8784 static unsigned int
8785 ix86_select_alt_pic_regnum (void)
8787 if (crtl->is_leaf
8788 && !crtl->profile
8789 && !ix86_current_function_calls_tls_descriptor)
8791 int i, drap;
8792 /* Can't use the same register for both PIC and DRAP. */
8793 if (crtl->drap_reg)
8794 drap = REGNO (crtl->drap_reg);
8795 else
8796 drap = -1;
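/* Hard registers 0..2 are %eax, %edx and %ecx, so the loop below
   tries %ecx, %edx and finally %eax as the alternate PIC register.  */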
8797 for (i = 2; i >= 0; --i)
8798 if (i != drap && !df_regs_ever_live_p (i))
8799 return i;
8802 return INVALID_REGNUM;
8805 /* Return TRUE if we need to save REGNO. */
8807 static bool
8808 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8810 if (pic_offset_table_rtx
8811 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8812 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8813 || crtl->profile
8814 || crtl->calls_eh_return
8815 || crtl->uses_const_pool))
8816 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8818 if (crtl->calls_eh_return && maybe_eh_return)
8820 unsigned i;
8821 for (i = 0; ; i++)
8823 unsigned test = EH_RETURN_DATA_REGNO (i);
8824 if (test == INVALID_REGNUM)
8825 break;
8826 if (test == regno)
8827 return true;
8831 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8832 return true;
8834 return (df_regs_ever_live_p (regno)
8835 && !call_used_regs[regno]
8836 && !fixed_regs[regno]
8837 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8840 /* Return the number of saved general purpose registers. */
8842 static int
8843 ix86_nsaved_regs (void)
8845 int nregs = 0;
8846 int regno;
8848 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8849 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8850 nregs ++;
8851 return nregs;
8854 /* Return the number of saved SSE registers. */
8856 static int
8857 ix86_nsaved_sseregs (void)
8859 int nregs = 0;
8860 int regno;
8862 if (!TARGET_64BIT_MS_ABI)
8863 return 0;
8864 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8865 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8866 nregs ++;
8867 return nregs;
8870 /* Given FROM and TO register numbers, say whether this elimination is
8871 allowed. If stack alignment is needed, we can only replace argument
8872 pointer with hard frame pointer, or replace frame pointer with stack
8873 pointer. Otherwise, frame pointer elimination is automatically
8874 handled and all other eliminations are valid. */
8876 static bool
8877 ix86_can_eliminate (const int from, const int to)
8879 if (stack_realign_fp)
8880 return ((from == ARG_POINTER_REGNUM
8881 && to == HARD_FRAME_POINTER_REGNUM)
8882 || (from == FRAME_POINTER_REGNUM
8883 && to == STACK_POINTER_REGNUM));
8884 else
8885 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8888 /* Return the offset between two registers, one to be eliminated, and the other
8889 its replacement, at the start of a routine. */
8891 HOST_WIDE_INT
8892 ix86_initial_elimination_offset (int from, int to)
8894 struct ix86_frame frame;
8895 ix86_compute_frame_layout (&frame);
8897 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8898 return frame.hard_frame_pointer_offset;
8899 else if (from == FRAME_POINTER_REGNUM
8900 && to == HARD_FRAME_POINTER_REGNUM)
8901 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8902 else
8904 gcc_assert (to == STACK_POINTER_REGNUM);
8906 if (from == ARG_POINTER_REGNUM)
8907 return frame.stack_pointer_offset;
8909 gcc_assert (from == FRAME_POINTER_REGNUM);
8910 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8914 /* In a dynamically-aligned function, we can't know the offset from
8915 stack pointer to frame pointer, so we must ensure that setjmp
8916 eliminates fp against the hard fp (%ebp) rather than trying to
8917 index from %esp up to the top of the frame across a gap that is
8918 of unknown (at compile-time) size. */
8919 static rtx
8920 ix86_builtin_setjmp_frame_value (void)
8922 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8925 /* When using -fsplit-stack, the allocation routines set a field in
8926 the TCB to the bottom of the stack plus this much space, measured
8927 in bytes. */
8929 #define SPLIT_STACK_AVAILABLE 256
8931 /* Fill structure ix86_frame describing the frame of the currently compiled function. */
8933 static void
8934 ix86_compute_frame_layout (struct ix86_frame *frame)
8936 unsigned HOST_WIDE_INT stack_alignment_needed;
8937 HOST_WIDE_INT offset;
8938 unsigned HOST_WIDE_INT preferred_alignment;
8939 HOST_WIDE_INT size = get_frame_size ();
8940 HOST_WIDE_INT to_allocate;
8942 frame->nregs = ix86_nsaved_regs ();
8943 frame->nsseregs = ix86_nsaved_sseregs ();
8945 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8946 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8948 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8949 except in function prologues and in leaf functions. */
8950 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8951 && (!crtl->is_leaf || cfun->calls_alloca != 0
8952 || ix86_current_function_calls_tls_descriptor))
8954 preferred_alignment = 16;
8955 stack_alignment_needed = 16;
8956 crtl->preferred_stack_boundary = 128;
8957 crtl->stack_alignment_needed = 128;
8960 gcc_assert (!size || stack_alignment_needed);
8961 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8962 gcc_assert (preferred_alignment <= stack_alignment_needed);
8964 /* For SEH we have to limit the amount of code movement into the prologue.
8965 At present we do this via a BLOCKAGE, at which point there's very little
8966 scheduling that can be done, which means that there's very little point
8967 in doing anything except PUSHs. */
8968 if (TARGET_SEH)
8969 cfun->machine->use_fast_prologue_epilogue = false;
8971 /* During reload the number of saved registers can change. Recompute
8972 the value as needed. Do not recompute when the number of registers
8973 didn't change, as reload makes multiple calls to this function and does
8974 not expect the decision to change within a single iteration. */
8975 else if (!optimize_function_for_size_p (cfun)
8976 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8978 int count = frame->nregs;
8979 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8981 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8983 /* The fast prologue uses move instead of push to save registers. This
8984 is significantly longer, but also executes faster as modern hardware
8985 can execute the moves in parallel, but can't do that for push/pop.
8987 Be careful about choosing which prologue to emit: when the function takes
8988 many instructions to execute, we may use the slow version, as well as
8989 when the function is known to be outside a hot spot (this is known only
8990 with profile feedback). Weight the size of the function by the number of
8991 registers to save, as it is cheap to use one or two push instructions but
8992 very slow to use many of them. */
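/* For example, with three registers to save the weight passed to
   expensive_function_p below is 2 * FAST_PROLOGUE_INSN_COUNT; the
   move-based fast prologue is used only when the estimated dynamic
   instruction count stays below that weight, and never for functions
   known (or profiled) to be cold.  */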
8993 if (count)
8994 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8995 if (node->frequency < NODE_FREQUENCY_NORMAL
8996 || (flag_branch_probabilities
8997 && node->frequency < NODE_FREQUENCY_HOT))
8998 cfun->machine->use_fast_prologue_epilogue = false;
8999 else
9000 cfun->machine->use_fast_prologue_epilogue
9001 = !expensive_function_p (count);
9004 frame->save_regs_using_mov
9005 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9006 /* If static stack checking is enabled and done with probes,
9007 the registers need to be saved before allocating the frame. */
9008 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9010 /* Skip return address. */
9011 offset = UNITS_PER_WORD;
9013 /* Skip pushed static chain. */
9014 if (ix86_static_chain_on_stack)
9015 offset += UNITS_PER_WORD;
9017 /* Skip saved base pointer. */
9018 if (frame_pointer_needed)
9019 offset += UNITS_PER_WORD;
9020 frame->hfp_save_offset = offset;
9022 /* The traditional frame pointer location is at the top of the frame. */
9023 frame->hard_frame_pointer_offset = offset;
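/* At this point, for a typical 64-bit frame with a frame pointer and no
   pushed static chain, OFFSET is 16: 8 bytes for the return address plus
   8 bytes for the saved frame pointer.  */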
9025 /* Register save area */
9026 offset += frame->nregs * UNITS_PER_WORD;
9027 frame->reg_save_offset = offset;
9029 /* On SEH target, registers are pushed just before the frame pointer
9030 location. */
9031 if (TARGET_SEH)
9032 frame->hard_frame_pointer_offset = offset;
9034 /* Align and set SSE register save area. */
9035 if (frame->nsseregs)
9037 /* The only ABI that has saved SSE registers (Win64) also has a
9038 16-byte aligned default stack, and thus we don't need to be
9039 within the re-aligned local stack frame to save them. */
9040 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9041 offset = (offset + 16 - 1) & -16;
9042 offset += frame->nsseregs * 16;
9044 frame->sse_reg_save_offset = offset;
9046 /* The re-aligned stack starts here. Values before this point are not
9047 directly comparable with values below this point. In order to make
9048 sure that no value happens to be the same before and after, force
9049 the alignment computation below to add a non-zero value. */
9050 if (stack_realign_fp)
9051 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
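/* Adding the full alignment before masking (rather than alignment - 1)
   guarantees the offset grows even if it was already aligned; e.g. with
   a 16-byte requirement an offset of 32 becomes 48 rather than staying
   at 32.  */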
9053 /* Va-arg area */
9054 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9055 offset += frame->va_arg_size;
9057 /* Align start of frame for local function. */
9058 if (stack_realign_fp
9059 || offset != frame->sse_reg_save_offset
9060 || size != 0
9061 || !crtl->is_leaf
9062 || cfun->calls_alloca
9063 || ix86_current_function_calls_tls_descriptor)
9064 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
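/* This is the usual round-up-to-alignment idiom: with a 16-byte
   requirement an offset of 40 becomes (40 + 15) & -16 == 48, while an
   already aligned 48 stays 48.  */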
9066 /* Frame pointer points here. */
9067 frame->frame_pointer_offset = offset;
9069 offset += size;
9071 /* Add the outgoing arguments area. It can be skipped if we eliminated
9072 all the function calls as dead code.
9073 Skipping is however impossible when the function calls alloca, as the
9074 alloca expander assumes that the last crtl->outgoing_args_size bytes
9075 of the stack frame are unused. */
9076 if (ACCUMULATE_OUTGOING_ARGS
9077 && (!crtl->is_leaf || cfun->calls_alloca
9078 || ix86_current_function_calls_tls_descriptor))
9080 offset += crtl->outgoing_args_size;
9081 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9083 else
9084 frame->outgoing_arguments_size = 0;
9086 /* Align stack boundary. Only needed if we're calling another function
9087 or using alloca. */
9088 if (!crtl->is_leaf || cfun->calls_alloca
9089 || ix86_current_function_calls_tls_descriptor)
9090 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9092 /* We've reached end of stack frame. */
9093 frame->stack_pointer_offset = offset;
9095 /* Size prologue needs to allocate. */
9096 to_allocate = offset - frame->sse_reg_save_offset;
9098 if ((!to_allocate && frame->nregs <= 1)
9099 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9100 frame->save_regs_using_mov = false;
9102 if (ix86_using_red_zone ()
9103 && crtl->sp_is_unchanging
9104 && crtl->is_leaf
9105 && !ix86_current_function_calls_tls_descriptor)
9107 frame->red_zone_size = to_allocate;
9108 if (frame->save_regs_using_mov)
9109 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9110 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9111 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9113 else
9114 frame->red_zone_size = 0;
9115 frame->stack_pointer_offset -= frame->red_zone_size;
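/* As an illustration: a 64-bit leaf function that saves no registers and
   needs 40 bytes of locals fits entirely in the red zone, so
   stack_pointer_offset is pulled back by the whole amount here and the
   prologue ends up emitting no stack adjustment for those locals.  */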
9117 /* The SEH frame pointer location is near the bottom of the frame.
9118 This is enforced by the fact that the difference between the
9119 stack pointer and the frame pointer is limited to 240 bytes in
9120 the unwind data structure. */
9121 if (TARGET_SEH)
9123 HOST_WIDE_INT diff;
9125 /* If we can leave the frame pointer where it is, do so. This also
9126 gives the establisher frame for __builtin_frame_address (0). */
9127 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9128 if (diff <= SEH_MAX_FRAME_SIZE
9129 && (diff > 240 || (diff & 15) != 0)
9130 && !crtl->accesses_prior_frames)
9132 /* Ideally we'd determine what portion of the local stack frame
9133 (within the constraint of the lowest 240) is most heavily used.
9134 But without that complication, simply bias the frame pointer
9135 by 128 bytes so as to maximize the amount of the local stack
9136 frame that is addressable with 8-bit offsets. */
9137 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
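/* With the frame pointer biased 128 bytes above the stack pointer, the
   256 bytes from fp-128 up to fp+127 are reachable with a one-byte
   (disp8) displacement, instead of only the 128 bytes above an unbiased
   frame pointer.  */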
9142 /* This is semi-inlined memory_address_length, but simplified
9143 since we know that we're always dealing with reg+offset, and
9144 to avoid having to create and discard all that rtl. */
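/* A few examples of the lengths computed below: a zero offset from %rbp
   still costs one byte (a mandatory disp8 of 0), a zero offset from %rsp
   costs one byte (the SIB byte), 8(%rsp) costs two (disp8 + SIB), and any
   offset outside [-128, 127] costs a four-byte displacement.  */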
9146 static inline int
9147 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9149 int len = 4;
9151 if (offset == 0)
9153 /* EBP and R13 cannot be encoded without an offset. */
9154 len = (regno == BP_REG || regno == R13_REG);
9156 else if (IN_RANGE (offset, -128, 127))
9157 len = 1;
9159 /* ESP and R12 must be encoded with a SIB byte. */
9160 if (regno == SP_REG || regno == R12_REG)
9161 len++;
9163 return len;
9166 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9167 The valid base registers are taken from CFUN->MACHINE->FS. */
9169 static rtx
9170 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9172 const struct machine_function *m = cfun->machine;
9173 rtx base_reg = NULL;
9174 HOST_WIDE_INT base_offset = 0;
9176 if (m->use_fast_prologue_epilogue)
9178 /* Choose the base register most likely to allow the most scheduling
9179 opportunities. Generally FP is valid throughout the function,
9180 while DRAP must be reloaded within the epilogue. But prefer either
9181 over the SP, whose addressing takes a larger encoding. */
9183 if (m->fs.fp_valid)
9185 base_reg = hard_frame_pointer_rtx;
9186 base_offset = m->fs.fp_offset - cfa_offset;
9188 else if (m->fs.drap_valid)
9190 base_reg = crtl->drap_reg;
9191 base_offset = 0 - cfa_offset;
9193 else if (m->fs.sp_valid)
9195 base_reg = stack_pointer_rtx;
9196 base_offset = m->fs.sp_offset - cfa_offset;
9199 else
9201 HOST_WIDE_INT toffset;
9202 int len = 16, tlen;
9204 /* Choose the base register with the smallest address encoding.
9205 With a tie, choose FP > DRAP > SP. */
9206 if (m->fs.sp_valid)
9208 base_reg = stack_pointer_rtx;
9209 base_offset = m->fs.sp_offset - cfa_offset;
9210 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9212 if (m->fs.drap_valid)
9214 toffset = 0 - cfa_offset;
9215 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9216 if (tlen <= len)
9218 base_reg = crtl->drap_reg;
9219 base_offset = toffset;
9220 len = tlen;
9223 if (m->fs.fp_valid)
9225 toffset = m->fs.fp_offset - cfa_offset;
9226 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9227 if (tlen <= len)
9229 base_reg = hard_frame_pointer_rtx;
9230 base_offset = toffset;
9231 len = tlen;
9235 gcc_assert (base_reg != NULL);
9237 return plus_constant (Pmode, base_reg, base_offset);
9240 /* Emit code to save registers in the prologue. */
9242 static void
9243 ix86_emit_save_regs (void)
9245 unsigned int regno;
9246 rtx insn;
9248 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9249 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9251 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9252 RTX_FRAME_RELATED_P (insn) = 1;
9256 /* Emit a single register save at CFA - CFA_OFFSET. */
9258 static void
9259 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9260 HOST_WIDE_INT cfa_offset)
9262 struct machine_function *m = cfun->machine;
9263 rtx reg = gen_rtx_REG (mode, regno);
9264 rtx mem, addr, base, insn;
9266 addr = choose_baseaddr (cfa_offset);
9267 mem = gen_frame_mem (mode, addr);
9269 /* For SSE saves, we need to indicate the 128-bit alignment. */
9270 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9272 insn = emit_move_insn (mem, reg);
9273 RTX_FRAME_RELATED_P (insn) = 1;
9275 base = addr;
9276 if (GET_CODE (base) == PLUS)
9277 base = XEXP (base, 0);
9278 gcc_checking_assert (REG_P (base));
9280 /* When saving registers into a re-aligned local stack frame, avoid
9281 any tricky guessing by dwarf2out. */
9282 if (m->fs.realigned)
9284 gcc_checking_assert (stack_realign_drap);
9286 if (regno == REGNO (crtl->drap_reg))
9288 /* A bit of a hack. We force the DRAP register to be saved in
9289 the re-aligned stack frame, which provides us with a copy
9290 of the CFA that will last past the prologue. Install it. */
9291 gcc_checking_assert (cfun->machine->fs.fp_valid);
9292 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9293 cfun->machine->fs.fp_offset - cfa_offset);
9294 mem = gen_rtx_MEM (mode, addr);
9295 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9297 else
9299 /* The frame pointer is a stable reference within the
9300 aligned frame. Use it. */
9301 gcc_checking_assert (cfun->machine->fs.fp_valid);
9302 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9303 cfun->machine->fs.fp_offset - cfa_offset);
9304 mem = gen_rtx_MEM (mode, addr);
9305 add_reg_note (insn, REG_CFA_EXPRESSION,
9306 gen_rtx_SET (VOIDmode, mem, reg));
9310 /* The memory may not be relative to the current CFA register,
9311 which means that we may need to generate a new pattern for
9312 use by the unwind info. */
9313 else if (base != m->fs.cfa_reg)
9315 addr = plus_constant (Pmode, m->fs.cfa_reg,
9316 m->fs.cfa_offset - cfa_offset);
9317 mem = gen_rtx_MEM (mode, addr);
9318 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9322 /* Emit code to save registers using MOV insns.
9323 First register is stored at CFA - CFA_OFFSET. */
9324 static void
9325 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9327 unsigned int regno;
9329 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9330 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9332 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9333 cfa_offset -= UNITS_PER_WORD;
9337 /* Emit code to save SSE registers using MOV insns.
9338 First register is stored at CFA - CFA_OFFSET. */
9339 static void
9340 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9342 unsigned int regno;
9344 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9345 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9347 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9348 cfa_offset -= 16;
9352 static GTY(()) rtx queued_cfa_restores;
9354 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9355 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9356 Don't add the note if the previously saved value will be left untouched
9357 within the stack red zone until return, as unwinders can find the same value
9358 in the register and on the stack. */
9360 static void
9361 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9363 if (!crtl->shrink_wrapped
9364 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9365 return;
9367 if (insn)
9369 add_reg_note (insn, REG_CFA_RESTORE, reg);
9370 RTX_FRAME_RELATED_P (insn) = 1;
9372 else
9373 queued_cfa_restores
9374 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9377 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9379 static void
9380 ix86_add_queued_cfa_restore_notes (rtx insn)
9382 rtx last;
9383 if (!queued_cfa_restores)
9384 return;
9385 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9387 XEXP (last, 1) = REG_NOTES (insn);
9388 REG_NOTES (insn) = queued_cfa_restores;
9389 queued_cfa_restores = NULL_RTX;
9390 RTX_FRAME_RELATED_P (insn) = 1;
9393 /* Expand a prologue or epilogue stack adjustment.
9394 The pattern exists to put a dependency on all ebp-based memory accesses.
9395 STYLE should be negative if instructions should be marked as frame
9396 related, zero if the %r11 register is live and cannot be freely used,
9397 and positive otherwise. */
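/* Concretely, when the adjustment does not fit a 32-bit immediate on
   x86-64, a nonzero STYLE loads the offset into %r11 first, while
   STYLE == 0 (when %r11 is live, e.g. around an indirect sibcall return)
   falls back to the hard frame pointer as the temporary.  */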
9399 static void
9400 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9401 int style, bool set_cfa)
9403 struct machine_function *m = cfun->machine;
9404 rtx insn;
9405 bool add_frame_related_expr = false;
9407 if (Pmode == SImode)
9408 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9409 else if (x86_64_immediate_operand (offset, DImode))
9410 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9411 else
9413 rtx tmp;
9414 /* r11 is used by indirect sibcall return as well, set before the
9415 epilogue and used after the epilogue. */
9416 if (style)
9417 tmp = gen_rtx_REG (DImode, R11_REG);
9418 else
9420 gcc_assert (src != hard_frame_pointer_rtx
9421 && dest != hard_frame_pointer_rtx);
9422 tmp = hard_frame_pointer_rtx;
9424 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9425 if (style < 0)
9426 add_frame_related_expr = true;
9428 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9431 insn = emit_insn (insn);
9432 if (style >= 0)
9433 ix86_add_queued_cfa_restore_notes (insn);
9435 if (set_cfa)
9437 rtx r;
9439 gcc_assert (m->fs.cfa_reg == src);
9440 m->fs.cfa_offset += INTVAL (offset);
9441 m->fs.cfa_reg = dest;
9443 r = gen_rtx_PLUS (Pmode, src, offset);
9444 r = gen_rtx_SET (VOIDmode, dest, r);
9445 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9446 RTX_FRAME_RELATED_P (insn) = 1;
9448 else if (style < 0)
9450 RTX_FRAME_RELATED_P (insn) = 1;
9451 if (add_frame_related_expr)
9453 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9454 r = gen_rtx_SET (VOIDmode, dest, r);
9455 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9459 if (dest == stack_pointer_rtx)
9461 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9462 bool valid = m->fs.sp_valid;
9464 if (src == hard_frame_pointer_rtx)
9466 valid = m->fs.fp_valid;
9467 ooffset = m->fs.fp_offset;
9469 else if (src == crtl->drap_reg)
9471 valid = m->fs.drap_valid;
9472 ooffset = 0;
9474 else
9476 /* Else there are two possibilities: SP itself, which we set
9477 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9478 taken care of by hand along the eh_return path. */
9479 gcc_checking_assert (src == stack_pointer_rtx
9480 || offset == const0_rtx);
9483 m->fs.sp_offset = ooffset - INTVAL (offset);
9484 m->fs.sp_valid = valid;
9488 /* Find an available register to be used as the dynamic realign argument
9489 pointer register. Such a register will be written in the prologue and
9490 used at the beginning of the body, so it must not be
9491 1. a parameter passing register.
9492 2. the GOT pointer.
9493 We reuse the static-chain register if it is available. Otherwise, we
9494 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9495 shorter encoding.
9497 Return: the regno of the chosen register. */
9499 static unsigned int
9500 find_drap_reg (void)
9502 tree decl = cfun->decl;
9504 if (TARGET_64BIT)
9506 /* Use R13 for a nested function or a function that needs a static chain.
9507 Since a function with a tail call may use any caller-saved
9508 register in the epilogue, DRAP must not use a caller-saved
9509 register in that case. */
9510 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9511 return R13_REG;
9513 return R10_REG;
9515 else
9517 /* Use DI for a nested function or a function that needs a static chain.
9518 Since a function with a tail call may use any caller-saved
9519 register in the epilogue, DRAP must not use a caller-saved
9520 register in that case. */
9521 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9522 return DI_REG;
9524 /* Reuse static chain register if it isn't used for parameter
9525 passing. */
9526 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9528 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9529 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9530 return CX_REG;
9532 return DI_REG;
9536 /* Return minimum incoming stack alignment. */
9538 static unsigned int
9539 ix86_minimum_incoming_stack_boundary (bool sibcall)
9541 unsigned int incoming_stack_boundary;
9543 /* Prefer the one specified at command line. */
9544 if (ix86_user_incoming_stack_boundary)
9545 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9546 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9547 if -mstackrealign is used, this isn't a sibcall check, and the
9548 estimated stack alignment is 128 bits. */
9549 else if (!sibcall
9550 && !TARGET_64BIT
9551 && ix86_force_align_arg_pointer
9552 && crtl->stack_alignment_estimated == 128)
9553 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9554 else
9555 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9557 /* Incoming stack alignment can be changed on individual functions
9558 via force_align_arg_pointer attribute. We use the smallest
9559 incoming stack boundary. */
9560 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9561 && lookup_attribute (ix86_force_align_arg_pointer_string,
9562 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9563 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9565 /* The incoming stack frame has to be aligned at least at
9566 parm_stack_boundary. */
9567 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9568 incoming_stack_boundary = crtl->parm_stack_boundary;
9570 /* The stack at the entry of main is aligned by the runtime. We use the
9571 smallest incoming stack boundary. */
9572 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9573 && DECL_NAME (current_function_decl)
9574 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9575 && DECL_FILE_SCOPE_P (current_function_decl))
9576 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9578 return incoming_stack_boundary;
9581 /* Update incoming stack boundary and estimated stack alignment. */
9583 static void
9584 ix86_update_stack_boundary (void)
9586 ix86_incoming_stack_boundary
9587 = ix86_minimum_incoming_stack_boundary (false);
9589 /* x86_64 varargs need 16-byte stack alignment for the register save
9590 area. */
9591 if (TARGET_64BIT
9592 && cfun->stdarg
9593 && crtl->stack_alignment_estimated < 128)
9594 crtl->stack_alignment_estimated = 128;
9597 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9598 needed or an rtx for DRAP otherwise. */
9600 static rtx
9601 ix86_get_drap_rtx (void)
9603 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9604 crtl->need_drap = true;
9606 if (stack_realign_drap)
9608 /* Assign DRAP to vDRAP and return vDRAP. */
9609 unsigned int regno = find_drap_reg ();
9610 rtx drap_vreg;
9611 rtx arg_ptr;
9612 rtx seq, insn;
9614 arg_ptr = gen_rtx_REG (Pmode, regno);
9615 crtl->drap_reg = arg_ptr;
9617 start_sequence ();
9618 drap_vreg = copy_to_reg (arg_ptr);
9619 seq = get_insns ();
9620 end_sequence ();
9622 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9623 if (!optimize)
9625 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9626 RTX_FRAME_RELATED_P (insn) = 1;
9628 return drap_vreg;
9630 else
9631 return NULL;
9634 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9636 static rtx
9637 ix86_internal_arg_pointer (void)
9639 return virtual_incoming_args_rtx;
9642 struct scratch_reg {
9643 rtx reg;
9644 bool saved;
9647 /* Return a short-lived scratch register for use on function entry.
9648 In 32-bit mode, it is valid only after the registers are saved
9649 in the prologue. This register must be released by means of
9650 release_scratch_register_on_entry once it is dead. */
9652 static void
9653 get_scratch_register_on_entry (struct scratch_reg *sr)
9655 int regno;
9657 sr->saved = false;
9659 if (TARGET_64BIT)
9661 /* We always use R11 in 64-bit mode. */
9662 regno = R11_REG;
9664 else
9666 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9667 bool fastcall_p
9668 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9669 bool thiscall_p
9670 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9671 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9672 int regparm = ix86_function_regparm (fntype, decl);
9673 int drap_regno
9674 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9676 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9677 for the static chain register. */
9678 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9679 && drap_regno != AX_REG)
9680 regno = AX_REG;
9681 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9682 for the static chain register. */
9683 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9684 regno = AX_REG;
9685 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9686 regno = DX_REG;
9687 /* ecx is the static chain register. */
9688 else if (regparm < 3 && !fastcall_p && !thiscall_p
9689 && !static_chain_p
9690 && drap_regno != CX_REG)
9691 regno = CX_REG;
9692 else if (ix86_save_reg (BX_REG, true))
9693 regno = BX_REG;
9694 /* esi is the static chain register. */
9695 else if (!(regparm == 3 && static_chain_p)
9696 && ix86_save_reg (SI_REG, true))
9697 regno = SI_REG;
9698 else if (ix86_save_reg (DI_REG, true))
9699 regno = DI_REG;
9700 else
9702 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9703 sr->saved = true;
9707 sr->reg = gen_rtx_REG (Pmode, regno);
9708 if (sr->saved)
9710 rtx insn = emit_insn (gen_push (sr->reg));
9711 RTX_FRAME_RELATED_P (insn) = 1;
9715 /* Release a scratch register obtained from the preceding function. */
9717 static void
9718 release_scratch_register_on_entry (struct scratch_reg *sr)
9720 if (sr->saved)
9722 struct machine_function *m = cfun->machine;
9723 rtx x, insn = emit_insn (gen_pop (sr->reg));
9725 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9726 RTX_FRAME_RELATED_P (insn) = 1;
9727 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9728 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9729 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9730 m->fs.sp_offset -= UNITS_PER_WORD;
9734 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9736 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9738 static void
9739 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9741 /* We skip the probe for the first interval + a small dope of 4 words and
9742 probe that many bytes past the specified size to maintain a protection
9743 area at the bottom of the stack. */
9744 const int dope = 4 * UNITS_PER_WORD;
9745 rtx size_rtx = GEN_INT (size), last;
9747 /* See if we have a constant small number of probes to generate. If so,
9748 that's the easy case. The run-time loop is made up of 11 insns in the
9749 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9750 for n # of intervals. */
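/* As a worked example, assuming the default 4 KB probe interval and a
   10000-byte frame: the unrolled case below drops SP by two intervals
   plus the dope and probes, drops by one more interval and probes, drops
   by the remaining 1808 bytes and probes, and finally adds back one
   interval plus the dope, for a net adjustment of exactly -10000.  */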
9751 if (size <= 5 * PROBE_INTERVAL)
9753 HOST_WIDE_INT i, adjust;
9754 bool first_probe = true;
9756 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9757 values of N from 1 until it exceeds SIZE. If only one probe is
9758 needed, this will not generate any code. Then adjust and probe
9759 to PROBE_INTERVAL + SIZE. */
9760 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9762 if (first_probe)
9764 adjust = 2 * PROBE_INTERVAL + dope;
9765 first_probe = false;
9767 else
9768 adjust = PROBE_INTERVAL;
9770 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9771 plus_constant (Pmode, stack_pointer_rtx,
9772 -adjust)));
9773 emit_stack_probe (stack_pointer_rtx);
9776 if (first_probe)
9777 adjust = size + PROBE_INTERVAL + dope;
9778 else
9779 adjust = size + PROBE_INTERVAL - i;
9781 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9782 plus_constant (Pmode, stack_pointer_rtx,
9783 -adjust)));
9784 emit_stack_probe (stack_pointer_rtx);
9786 /* Adjust back to account for the additional first interval. */
9787 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9788 plus_constant (Pmode, stack_pointer_rtx,
9789 PROBE_INTERVAL + dope)));
9792 /* Otherwise, do the same as above, but in a loop. Note that we must be
9793 extra careful with variables wrapping around because we might be at
9794 the very top (or the very bottom) of the address space and we have
9795 to be able to handle this case properly; in particular, we use an
9796 equality test for the loop condition. */
9797 else
9799 HOST_WIDE_INT rounded_size;
9800 struct scratch_reg sr;
9802 get_scratch_register_on_entry (&sr);
9805 /* Step 1: round SIZE to the previous multiple of the interval. */
9807 rounded_size = size & -PROBE_INTERVAL;
9810 /* Step 2: compute initial and final value of the loop counter. */
9812 /* SP = SP_0 + PROBE_INTERVAL. */
9813 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9814 plus_constant (Pmode, stack_pointer_rtx,
9815 - (PROBE_INTERVAL + dope))));
9817 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9818 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9819 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9820 gen_rtx_PLUS (Pmode, sr.reg,
9821 stack_pointer_rtx)));
9824 /* Step 3: the loop
9826 while (SP != LAST_ADDR)
9828 SP = SP + PROBE_INTERVAL
9829 probe at SP
9832 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9833 values of N from 1 until it is equal to ROUNDED_SIZE. */
9835 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9838 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9839 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9841 if (size != rounded_size)
9843 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9844 plus_constant (Pmode, stack_pointer_rtx,
9845 rounded_size - size)));
9846 emit_stack_probe (stack_pointer_rtx);
9849 /* Adjust back to account for the additional first interval. */
9850 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9851 plus_constant (Pmode, stack_pointer_rtx,
9852 PROBE_INTERVAL + dope)));
9854 release_scratch_register_on_entry (&sr);
9857 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9859 /* Even if the stack pointer isn't the CFA register, we need to correctly
9860 describe the adjustments made to it, in particular differentiate the
9861 frame-related ones from the frame-unrelated ones. */
9862 if (size > 0)
9864 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9865 XVECEXP (expr, 0, 0)
9866 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9867 plus_constant (Pmode, stack_pointer_rtx, -size));
9868 XVECEXP (expr, 0, 1)
9869 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9870 plus_constant (Pmode, stack_pointer_rtx,
9871 PROBE_INTERVAL + dope + size));
9872 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9873 RTX_FRAME_RELATED_P (last) = 1;
9875 cfun->machine->fs.sp_offset += size;
9878 /* Make sure nothing is scheduled before we are done. */
9879 emit_insn (gen_blockage ());
9882 /* Adjust the stack pointer up to REG while probing it. */
9884 const char *
9885 output_adjust_stack_and_probe (rtx reg)
9887 static int labelno = 0;
9888 char loop_lab[32], end_lab[32];
9889 rtx xops[2];
9891 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9892 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9894 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9896 /* Jump to END_LAB if SP == LAST_ADDR. */
9897 xops[0] = stack_pointer_rtx;
9898 xops[1] = reg;
9899 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9900 fputs ("\tje\t", asm_out_file);
9901 assemble_name_raw (asm_out_file, end_lab);
9902 fputc ('\n', asm_out_file);
9904 /* SP = SP + PROBE_INTERVAL. */
9905 xops[1] = GEN_INT (PROBE_INTERVAL);
9906 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9908 /* Probe at SP. */
9909 xops[1] = const0_rtx;
9910 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9912 fprintf (asm_out_file, "\tjmp\t");
9913 assemble_name_raw (asm_out_file, loop_lab);
9914 fputc ('\n', asm_out_file);
9916 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9918 return "";
9921 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9922 inclusive. These are offsets from the current stack pointer. */
9924 static void
9925 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9927 /* See if we have a constant small number of probes to generate. If so,
9928 that's the easy case. The run-time loop is made up of 7 insns in the
9929 generic case while the compile-time loop is made up of n insns for n #
9930 of intervals. */
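/* For instance, assuming a 4 KB probe interval, FIRST = 8 KB and
   SIZE = 12 KB, the unrolled case below probes at sp-12K, sp-16K and
   sp-20K without ever moving the stack pointer.  */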
9931 if (size <= 7 * PROBE_INTERVAL)
9933 HOST_WIDE_INT i;
9935 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9936 it exceeds SIZE. If only one probe is needed, this will not
9937 generate any code. Then probe at FIRST + SIZE. */
9938 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9939 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9940 -(first + i)));
9942 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9943 -(first + size)));
9946 /* Otherwise, do the same as above, but in a loop. Note that we must be
9947 extra careful with variables wrapping around because we might be at
9948 the very top (or the very bottom) of the address space and we have
9949 to be able to handle this case properly; in particular, we use an
9950 equality test for the loop condition. */
9951 else
9953 HOST_WIDE_INT rounded_size, last;
9954 struct scratch_reg sr;
9956 get_scratch_register_on_entry (&sr);
9959 /* Step 1: round SIZE to the previous multiple of the interval. */
9961 rounded_size = size & -PROBE_INTERVAL;
9964 /* Step 2: compute initial and final value of the loop counter. */
9966 /* TEST_OFFSET = FIRST. */
9967 emit_move_insn (sr.reg, GEN_INT (-first));
9969 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9970 last = first + rounded_size;
9973 /* Step 3: the loop
9975 while (TEST_ADDR != LAST_ADDR)
9977 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9978 probe at TEST_ADDR
9981 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9982 until it is equal to ROUNDED_SIZE. */
9984 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9987 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9988 that SIZE is equal to ROUNDED_SIZE. */
9990 if (size != rounded_size)
9991 emit_stack_probe (plus_constant (Pmode,
9992 gen_rtx_PLUS (Pmode,
9993 stack_pointer_rtx,
9994 sr.reg),
9995 rounded_size - size));
9997 release_scratch_register_on_entry (&sr);
10000 /* Make sure nothing is scheduled before we are done. */
10001 emit_insn (gen_blockage ());
10004 /* Probe a range of stack addresses from REG to END, inclusive. These are
10005 offsets from the current stack pointer. */
10007 const char *
10008 output_probe_stack_range (rtx reg, rtx end)
10010 static int labelno = 0;
10011 char loop_lab[32], end_lab[32];
10012 rtx xops[3];
10014 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10015 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10017 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10019 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10020 xops[0] = reg;
10021 xops[1] = end;
10022 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10023 fputs ("\tje\t", asm_out_file);
10024 assemble_name_raw (asm_out_file, end_lab);
10025 fputc ('\n', asm_out_file);
10027 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10028 xops[1] = GEN_INT (PROBE_INTERVAL);
10029 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10031 /* Probe at TEST_ADDR. */
10032 xops[0] = stack_pointer_rtx;
10033 xops[1] = reg;
10034 xops[2] = const0_rtx;
10035 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10037 fprintf (asm_out_file, "\tjmp\t");
10038 assemble_name_raw (asm_out_file, loop_lab);
10039 fputc ('\n', asm_out_file);
10041 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10043 return "";
10046 /* Finalize the stack_realign_needed flag, which guides the prologue and
10047 epilogue so that they are generated in the correct form. */
10048 static void
10049 ix86_finalize_stack_realign_flags (void)
10051 /* Check whether stack realignment is really needed after reload, and
10052 store the result in cfun. */
10053 unsigned int incoming_stack_boundary
10054 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10055 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10056 unsigned int stack_realign = (incoming_stack_boundary
10057 < (crtl->is_leaf
10058 ? crtl->max_used_stack_slot_alignment
10059 : crtl->stack_alignment_needed));
10061 if (crtl->stack_realign_finalized)
10063 /* After stack_realign_needed is finalized, we can no longer
10064 change it. */
10065 gcc_assert (crtl->stack_realign_needed == stack_realign);
10066 return;
10069 /* If the only reason for frame_pointer_needed is that we conservatively
10070 assumed stack realignment might be needed, but in the end nothing that
10071 needed the stack alignment had been spilled, clear frame_pointer_needed
10072 and say we don't need stack realignment. */
10073 if (stack_realign
10074 && !crtl->need_drap
10075 && frame_pointer_needed
10076 && crtl->is_leaf
10077 && flag_omit_frame_pointer
10078 && crtl->sp_is_unchanging
10079 && !ix86_current_function_calls_tls_descriptor
10080 && !crtl->accesses_prior_frames
10081 && !cfun->calls_alloca
10082 && !crtl->calls_eh_return
10083 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10084 && !ix86_frame_pointer_required ()
10085 && get_frame_size () == 0
10086 && ix86_nsaved_sseregs () == 0
10087 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10089 HARD_REG_SET set_up_by_prologue, prologue_used;
10090 basic_block bb;
10092 CLEAR_HARD_REG_SET (prologue_used);
10093 CLEAR_HARD_REG_SET (set_up_by_prologue);
10094 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10095 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10096 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10097 HARD_FRAME_POINTER_REGNUM);
10098 FOR_EACH_BB (bb)
10100 rtx insn;
10101 FOR_BB_INSNS (bb, insn)
10102 if (NONDEBUG_INSN_P (insn)
10103 && requires_stack_frame_p (insn, prologue_used,
10104 set_up_by_prologue))
10106 crtl->stack_realign_needed = stack_realign;
10107 crtl->stack_realign_finalized = true;
10108 return;
10112 frame_pointer_needed = false;
10113 stack_realign = false;
10114 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10115 crtl->stack_alignment_needed = incoming_stack_boundary;
10116 crtl->stack_alignment_estimated = incoming_stack_boundary;
10117 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10118 crtl->preferred_stack_boundary = incoming_stack_boundary;
10119 df_finish_pass (true);
10120 df_scan_alloc (NULL);
10121 df_scan_blocks ();
10122 df_compute_regs_ever_live (true);
10123 df_analyze ();
10126 crtl->stack_realign_needed = stack_realign;
10127 crtl->stack_realign_finalized = true;
10130 /* Expand the prologue into a bunch of separate insns. */
10132 void
10133 ix86_expand_prologue (void)
10135 struct machine_function *m = cfun->machine;
10136 rtx insn, t;
10137 bool pic_reg_used;
10138 struct ix86_frame frame;
10139 HOST_WIDE_INT allocate;
10140 bool int_registers_saved;
10141 bool sse_registers_saved;
10143 ix86_finalize_stack_realign_flags ();
10145 /* DRAP should not coexist with stack_realign_fp */
10146 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10148 memset (&m->fs, 0, sizeof (m->fs));
10150 /* Initialize CFA state for before the prologue. */
10151 m->fs.cfa_reg = stack_pointer_rtx;
10152 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10154 /* Track SP offset to the CFA. We continue tracking this after we've
10155 swapped the CFA register away from SP. In the case of re-alignment
10156 this is fudged; we're interested in offsets within the local frame. */
10157 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10158 m->fs.sp_valid = true;
10160 ix86_compute_frame_layout (&frame);
10162 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10164 /* We should have already generated an error for any use of
10165 ms_hook on a nested function. */
10166 gcc_checking_assert (!ix86_static_chain_on_stack);
10168 /* Check whether profiling is active and we shall use the profiling
10169 before prologue variant. If so, sorry. */
10170 if (crtl->profile && flag_fentry != 0)
10171 sorry ("ms_hook_prologue attribute isn%'t compatible "
10172 "with -mfentry for 32-bit");
10174 /* In ix86_asm_output_function_label we emitted:
10175 8b ff movl.s %edi,%edi
10176 55 push %ebp
10177 8b ec movl.s %esp,%ebp
10179 This matches the hookable function prologue in Win32 API
10180 functions in Microsoft Windows XP Service Pack 2 and newer.
10181 Wine uses this to enable Windows apps to hook the Win32 API
10182 functions provided by Wine.
10184 What that means is that we've already set up the frame pointer. */
10186 if (frame_pointer_needed
10187 && !(crtl->drap_reg && crtl->stack_realign_needed))
10189 rtx push, mov;
10191 /* We've decided to use the frame pointer already set up.
10192 Describe this to the unwinder by pretending that both
10193 push and mov insns happen right here.
10195 Putting the unwind info here at the end of the ms_hook
10196 is done so that we can make absolutely certain we get
10197 the required byte sequence at the start of the function,
10198 rather than relying on an assembler that can produce
10199 the exact encoding required.
10201 However it does mean (in the unpatched case) that we have
10202 a 1 insn window where the asynchronous unwind info is
10203 incorrect. However, if we placed the unwind info at
10204 its correct location we would have incorrect unwind info
10205 in the patched case. Which is probably all moot since
10206 I don't expect Wine generates dwarf2 unwind info for the
10207 system libraries that use this feature. */
10209 insn = emit_insn (gen_blockage ());
10211 push = gen_push (hard_frame_pointer_rtx);
10212 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10213 stack_pointer_rtx);
10214 RTX_FRAME_RELATED_P (push) = 1;
10215 RTX_FRAME_RELATED_P (mov) = 1;
10217 RTX_FRAME_RELATED_P (insn) = 1;
10218 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10219 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10221 /* Note that gen_push incremented m->fs.cfa_offset, even
10222 though we didn't emit the push insn here. */
10223 m->fs.cfa_reg = hard_frame_pointer_rtx;
10224 m->fs.fp_offset = m->fs.cfa_offset;
10225 m->fs.fp_valid = true;
10227 else
10229 /* The frame pointer is not needed so pop %ebp again.
10230 This leaves us with a pristine state. */
10231 emit_insn (gen_pop (hard_frame_pointer_rtx));
10235 /* The first insn of a function that accepts its static chain on the
10236 stack is to push the register that would be filled in by a direct
10237 call. This insn will be skipped by the trampoline. */
10238 else if (ix86_static_chain_on_stack)
10240 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10241 emit_insn (gen_blockage ());
10243 /* We don't want to interpret this push insn as a register save,
10244 only as a stack adjustment. The real copy of the register as
10245 a save will be done later, if needed. */
10246 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10247 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10248 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10249 RTX_FRAME_RELATED_P (insn) = 1;
10252 /* Emit prologue code to adjust the stack alignment and set up DRAP, in
10253 case DRAP is needed and stack realignment is really needed after reload. */
10254 if (stack_realign_drap)
10256 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10258 /* Only need to push the parameter pointer reg if it is call-saved (not call-used). */
10259 if (!call_used_regs[REGNO (crtl->drap_reg)])
10261 /* Push arg pointer reg */
10262 insn = emit_insn (gen_push (crtl->drap_reg));
10263 RTX_FRAME_RELATED_P (insn) = 1;
10266 /* Grab the argument pointer. */
10267 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10268 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10269 RTX_FRAME_RELATED_P (insn) = 1;
10270 m->fs.cfa_reg = crtl->drap_reg;
10271 m->fs.cfa_offset = 0;
10273 /* Align the stack. */
10274 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10275 stack_pointer_rtx,
10276 GEN_INT (-align_bytes)));
10277 RTX_FRAME_RELATED_P (insn) = 1;
10279 /* Replicate the return address on the stack so that the return
10280 address can be reached via the (argp - 1) slot. This is needed
10281 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10282 expand_builtin_return_addr, etc. */
10283 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10284 t = gen_frame_mem (word_mode, t);
10285 insn = emit_insn (gen_push (t));
10286 RTX_FRAME_RELATED_P (insn) = 1;
10288 /* For the purposes of frame and register save area addressing,
10289 we've started over with a new frame. */
10290 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10291 m->fs.realigned = true;
10294 int_registers_saved = (frame.nregs == 0);
10295 sse_registers_saved = (frame.nsseregs == 0);
10297 if (frame_pointer_needed && !m->fs.fp_valid)
10299 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10300 slower on all targets. Also sdb doesn't like it. */
10301 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10302 RTX_FRAME_RELATED_P (insn) = 1;
10304 /* Push registers now, before setting the frame pointer
10305 on SEH target. */
10306 if (!int_registers_saved
10307 && TARGET_SEH
10308 && !frame.save_regs_using_mov)
10310 ix86_emit_save_regs ();
10311 int_registers_saved = true;
10312 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10315 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10317 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10318 RTX_FRAME_RELATED_P (insn) = 1;
10320 if (m->fs.cfa_reg == stack_pointer_rtx)
10321 m->fs.cfa_reg = hard_frame_pointer_rtx;
10322 m->fs.fp_offset = m->fs.sp_offset;
10323 m->fs.fp_valid = true;
10327 if (!int_registers_saved)
10329 /* If saving registers via PUSH, do so now. */
10330 if (!frame.save_regs_using_mov)
10332 ix86_emit_save_regs ();
10333 int_registers_saved = true;
10334 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10337 /* When using the red zone we may start saving registers before allocating
10338 the stack frame, saving one cycle of the prologue. However, avoid
10339 doing this if we have to probe the stack; at least on x86_64 the
10340 stack probe can turn into a call that clobbers a red zone location. */
10341 else if (ix86_using_red_zone ()
10342 && (! TARGET_STACK_PROBE
10343 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10345 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10346 int_registers_saved = true;
10350 if (stack_realign_fp)
10352 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10353 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10355 /* The computation of the size of the re-aligned stack frame means
10356 that we must allocate the size of the register save area before
10357 performing the actual alignment. Otherwise we cannot guarantee
10358 that there's enough storage above the realignment point. */
10359 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10360 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10361 GEN_INT (m->fs.sp_offset
10362 - frame.sse_reg_save_offset),
10363 -1, false);
10365 /* Align the stack. */
10366 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10367 stack_pointer_rtx,
10368 GEN_INT (-align_bytes)));
10370 /* For the purposes of register save area addressing, the stack
10371 pointer is no longer valid. As for the value of sp_offset,
10372 see ix86_compute_frame_layout, which we need to match in order
10373 to pass verification of stack_pointer_offset at the end. */
10374 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10375 m->fs.sp_valid = false;
10378 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10380 if (flag_stack_usage_info)
10382 /* We start to count from ARG_POINTER. */
10383 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10385 /* If it was realigned, take into account the fake frame. */
10386 if (stack_realign_drap)
10388 if (ix86_static_chain_on_stack)
10389 stack_size += UNITS_PER_WORD;
10391 if (!call_used_regs[REGNO (crtl->drap_reg)])
10392 stack_size += UNITS_PER_WORD;
10394 /* This over-estimates by 1 minimal-stack-alignment-unit but
10395 mitigates that by counting in the new return address slot. */
10396 current_function_dynamic_stack_size
10397 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10400 current_function_static_stack_size = stack_size;
10403 /* On SEH target with very large frame size, allocate an area to save
10404 SSE registers (as the very large allocation won't be described). */
10405 if (TARGET_SEH
10406 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10407 && !sse_registers_saved)
10409 HOST_WIDE_INT sse_size =
10410 frame.sse_reg_save_offset - frame.reg_save_offset;
10412 gcc_assert (int_registers_saved);
10414 /* No need to do stack checking as the area will be immediately
10415 written. */
10416 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10417 GEN_INT (-sse_size), -1,
10418 m->fs.cfa_reg == stack_pointer_rtx);
10419 allocate -= sse_size;
10420 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10421 sse_registers_saved = true;
10424 /* The stack has already been decremented by the instruction calling us
10425 so probe if the size is non-negative to preserve the protection area. */
10426 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10428 /* We expect the registers to be saved when probes are used. */
10429 gcc_assert (int_registers_saved);
10431 if (STACK_CHECK_MOVING_SP)
10433 ix86_adjust_stack_and_probe (allocate);
10434 allocate = 0;
10436 else
10438 HOST_WIDE_INT size = allocate;
10440 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10441 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10443 if (TARGET_STACK_PROBE)
10444 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10445 else
10446 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10450 if (allocate == 0)
10452 else if (!ix86_target_stack_probe ()
10453 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10455 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10456 GEN_INT (-allocate), -1,
10457 m->fs.cfa_reg == stack_pointer_rtx);
10459 else
10461 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10462 rtx r10 = NULL;
10463 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10464 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10465 bool eax_live = false;
10466 bool r10_live = false;
10468 if (TARGET_64BIT)
10469 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10470 if (!TARGET_64BIT_MS_ABI)
10471 eax_live = ix86_eax_live_at_start_p ();
10473 /* Note that SEH directives need to continue tracking the stack
10474 pointer even after the frame pointer has been set up. */
10475 if (eax_live)
10477 insn = emit_insn (gen_push (eax));
10478 allocate -= UNITS_PER_WORD;
10479 if (sp_is_cfa_reg || TARGET_SEH)
10481 if (sp_is_cfa_reg)
10482 m->fs.cfa_offset += UNITS_PER_WORD;
10483 RTX_FRAME_RELATED_P (insn) = 1;
10487 if (r10_live)
10489 r10 = gen_rtx_REG (Pmode, R10_REG);
10490 insn = emit_insn (gen_push (r10));
10491 allocate -= UNITS_PER_WORD;
10492 if (sp_is_cfa_reg || TARGET_SEH)
10494 if (sp_is_cfa_reg)
10495 m->fs.cfa_offset += UNITS_PER_WORD;
10496 RTX_FRAME_RELATED_P (insn) = 1;
10500 emit_move_insn (eax, GEN_INT (allocate));
10501 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10503 /* Use the fact that AX still contains ALLOCATE. */
10504 adjust_stack_insn = (Pmode == DImode
10505 ? gen_pro_epilogue_adjust_stack_di_sub
10506 : gen_pro_epilogue_adjust_stack_si_sub);
10508 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10509 stack_pointer_rtx, eax));
10511 if (sp_is_cfa_reg || TARGET_SEH)
10513 if (sp_is_cfa_reg)
10514 m->fs.cfa_offset += allocate;
10515 RTX_FRAME_RELATED_P (insn) = 1;
10516 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10517 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10518 plus_constant (Pmode, stack_pointer_rtx,
10519 -allocate)));
10521 m->fs.sp_offset += allocate;
10523 if (r10_live && eax_live)
10525 t = choose_baseaddr (m->fs.sp_offset - allocate);
10526 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10527 gen_frame_mem (word_mode, t));
10528 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10529 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10530 gen_frame_mem (word_mode, t));
10532 else if (eax_live || r10_live)
10534 t = choose_baseaddr (m->fs.sp_offset - allocate);
10535 emit_move_insn (gen_rtx_REG (word_mode,
10536 (eax_live ? AX_REG : R10_REG)),
10537 gen_frame_mem (word_mode, t));
10540 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10542 /* If we haven't already set up the frame pointer, do so now. */
10543 if (frame_pointer_needed && !m->fs.fp_valid)
10545 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10546 GEN_INT (frame.stack_pointer_offset
10547 - frame.hard_frame_pointer_offset));
10548 insn = emit_insn (insn);
10549 RTX_FRAME_RELATED_P (insn) = 1;
10550 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10552 if (m->fs.cfa_reg == stack_pointer_rtx)
10553 m->fs.cfa_reg = hard_frame_pointer_rtx;
10554 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10555 m->fs.fp_valid = true;
10558 if (!int_registers_saved)
10559 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10560 if (!sse_registers_saved)
10561 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10563 pic_reg_used = false;
10564 if (pic_offset_table_rtx
10565 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10566 || crtl->profile))
10568 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10570 if (alt_pic_reg_used != INVALID_REGNUM)
10571 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10573 pic_reg_used = true;
10576 if (pic_reg_used)
10578 if (TARGET_64BIT)
10580 if (ix86_cmodel == CM_LARGE_PIC)
10582 rtx label, tmp_reg;
10584 gcc_assert (Pmode == DImode);
10585 label = gen_label_rtx ();
10586 emit_label (label);
10587 LABEL_PRESERVE_P (label) = 1;
10588 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10589 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10590 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10591 label));
10592 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10593 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10594 pic_offset_table_rtx, tmp_reg));
10596 else
10597 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10599 else
10601 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10602 RTX_FRAME_RELATED_P (insn) = 1;
10603 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10607 /* In the pic_reg_used case, make sure that the got load isn't deleted
10608 when mcount needs it. Blockage to avoid call movement across mcount
10609 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10610 note. */
10611 if (crtl->profile && !flag_fentry && pic_reg_used)
10612 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10614 if (crtl->drap_reg && !crtl->stack_realign_needed)
10616 /* vDRAP is set up, but after reload it turns out stack realignment
10617 isn't necessary; here we emit prologue code to set up DRAP
10618 without the stack realignment adjustment. */
10619 t = choose_baseaddr (0);
10620 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10623 /* Prevent instructions from being scheduled into register save push
10624 sequence when access to the redzone area is done through frame pointer.
10625 The offset between the frame pointer and the stack pointer is calculated
10626 relative to the value of the stack pointer at the end of the function
10627 prologue, and moving instructions that access redzone area via frame
10628 pointer inside push sequence violates this assumption. */
10629 if (frame_pointer_needed && frame.red_zone_size)
10630 emit_insn (gen_memory_blockage ());
10632 /* Emit cld instruction if stringops are used in the function. */
10633 if (TARGET_CLD && ix86_current_function_needs_cld)
10634 emit_insn (gen_cld ());
10636 /* SEH requires that the prologue end within 256 bytes of the start of
10637 the function. Prevent instruction schedules that would extend that.
10638 Further, prevent alloca modifications to the stack pointer from being
10639 combined with prologue modifications. */
10640 if (TARGET_SEH)
10641 emit_insn (gen_prologue_use (stack_pointer_rtx));
10644 /* Emit code to restore REG using a POP insn. */
10646 static void
10647 ix86_emit_restore_reg_using_pop (rtx reg)
10649 struct machine_function *m = cfun->machine;
10650 rtx insn = emit_insn (gen_pop (reg));
10652 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10653 m->fs.sp_offset -= UNITS_PER_WORD;
10655 if (m->fs.cfa_reg == crtl->drap_reg
10656 && REGNO (reg) == REGNO (crtl->drap_reg))
10658 /* Previously we'd represented the CFA as an expression
10659 like *(%ebp - 8). We've just popped that value from
10660 the stack, which means we need to reset the CFA to
10661 the drap register. This will remain until we restore
10662 the stack pointer. */
10663 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10664 RTX_FRAME_RELATED_P (insn) = 1;
10666 /* This means that the DRAP register is valid for addressing too. */
10667 m->fs.drap_valid = true;
10668 return;
10671 if (m->fs.cfa_reg == stack_pointer_rtx)
10673 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10674 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10675 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10676 RTX_FRAME_RELATED_P (insn) = 1;
10678 m->fs.cfa_offset -= UNITS_PER_WORD;
10681 /* When the frame pointer is the CFA, and we pop it, we are
10682 swapping back to the stack pointer as the CFA. This happens
10683 for stack frames that don't allocate other data, so we assume
10684 the stack pointer is now pointing at the return address, i.e.
10685 the function entry state, which makes the offset be 1 word. */
10686 if (reg == hard_frame_pointer_rtx)
10688 m->fs.fp_valid = false;
10689 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10691 m->fs.cfa_reg = stack_pointer_rtx;
10692 m->fs.cfa_offset -= UNITS_PER_WORD;
10694 add_reg_note (insn, REG_CFA_DEF_CFA,
10695 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10696 GEN_INT (m->fs.cfa_offset)));
10697 RTX_FRAME_RELATED_P (insn) = 1;
10702 /* Emit code to restore saved registers using POP insns. */
10704 static void
10705 ix86_emit_restore_regs_using_pop (void)
10707 unsigned int regno;
10709 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10710 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10711 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10714 /* Emit code and notes for the LEAVE instruction. */
10716 static void
10717 ix86_emit_leave (void)
10719 struct machine_function *m = cfun->machine;
10720 rtx insn = emit_insn (ix86_gen_leave ());
10722 ix86_add_queued_cfa_restore_notes (insn);
10724 gcc_assert (m->fs.fp_valid);
10725 m->fs.sp_valid = true;
10726 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10727 m->fs.fp_valid = false;
10729 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10731 m->fs.cfa_reg = stack_pointer_rtx;
10732 m->fs.cfa_offset = m->fs.sp_offset;
10734 add_reg_note (insn, REG_CFA_DEF_CFA,
10735 plus_constant (Pmode, stack_pointer_rtx,
10736 m->fs.sp_offset));
10737 RTX_FRAME_RELATED_P (insn) = 1;
10739 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10740 m->fs.fp_offset);
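/* For reference, leave is roughly equivalent to the sequence

       mov  %ebp, %esp        (mov %rbp, %rsp in 64-bit mode)
       pop  %ebp

   which is why sp_offset becomes fp_offset minus one word and the
   frame pointer ceases to be valid.  */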
10743 /* Emit code to restore saved registers using MOV insns.
10744 First register is restored from CFA - CFA_OFFSET. */
10745 static void
10746 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10747 bool maybe_eh_return)
10749 struct machine_function *m = cfun->machine;
10750 unsigned int regno;
10752 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10753 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10755 rtx reg = gen_rtx_REG (word_mode, regno);
10756 rtx insn, mem;
10758 mem = choose_baseaddr (cfa_offset);
10759 mem = gen_frame_mem (word_mode, mem);
10760 insn = emit_move_insn (reg, mem);
10762 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10764 /* Previously we'd represented the CFA as an expression
10765 like *(%ebp - 8). We've just popped that value from
10766 the stack, which means we need to reset the CFA to
10767 the drap register. This will remain until we restore
10768 the stack pointer. */
10769 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10770 RTX_FRAME_RELATED_P (insn) = 1;
10772 /* This means that the DRAP register is valid for addressing. */
10773 m->fs.drap_valid = true;
10775 else
10776 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10778 cfa_offset -= UNITS_PER_WORD;
10782 /* Emit code to restore saved registers using MOV insns.
10783 First register is restored from CFA - CFA_OFFSET. */
10784 static void
10785 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10786 bool maybe_eh_return)
10788 unsigned int regno;
10790 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10791 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10793 rtx reg = gen_rtx_REG (V4SFmode, regno);
10794 rtx mem;
10796 mem = choose_baseaddr (cfa_offset);
10797 mem = gen_rtx_MEM (V4SFmode, mem);
10798 set_mem_align (mem, 128);
10799 emit_move_insn (reg, mem);
10801 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10803 cfa_offset -= 16;
10807 /* Restore function stack, frame, and registers. */
10809 void
10810 ix86_expand_epilogue (int style)
10812 struct machine_function *m = cfun->machine;
10813 struct machine_frame_state frame_state_save = m->fs;
10814 struct ix86_frame frame;
10815 bool restore_regs_via_mov;
10816 bool using_drap;
10818 ix86_finalize_stack_realign_flags ();
10819 ix86_compute_frame_layout (&frame);
10821 m->fs.sp_valid = (!frame_pointer_needed
10822 || (crtl->sp_is_unchanging
10823 && !stack_realign_fp));
10824 gcc_assert (!m->fs.sp_valid
10825 || m->fs.sp_offset == frame.stack_pointer_offset);
10827 /* The FP must be valid if the frame pointer is present. */
10828 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10829 gcc_assert (!m->fs.fp_valid
10830 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10832 /* We must have *some* valid pointer to the stack frame. */
10833 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10835 /* The DRAP is never valid at this point. */
10836 gcc_assert (!m->fs.drap_valid);
10838 /* See the comment about red zone and frame
10839 pointer usage in ix86_expand_prologue. */
10840 if (frame_pointer_needed && frame.red_zone_size)
10841 emit_insn (gen_memory_blockage ());
10843 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10844 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10846 /* Determine the CFA offset of the end of the red-zone. */
10847 m->fs.red_zone_offset = 0;
10848 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10850 /* The red-zone begins below the return address. */
10851 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
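/* As a reminder, the x86-64 red zone is the 128-byte area below the
   stack pointer that the ABI guarantees signal and interrupt handlers
   will not clobber, so leaf code may use it without adjusting %rsp.
   It is measured here from the CFA, hence the extra word for the
   return address.  */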
10853 /* When the register save area is in the aligned portion of
10854 the stack, determine the maximum runtime displacement that
10855 matches up with the aligned frame. */
10856 if (stack_realign_drap)
10857 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10858 + UNITS_PER_WORD);
10861 /* Special care must be taken for the normal return case of a function
10862 using eh_return: the eax and edx registers are marked as saved, but
10863 not restored along this path. Adjust the save location to match. */
10864 if (crtl->calls_eh_return && style != 2)
10865 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10867 /* EH_RETURN requires the use of moves to function properly. */
10868 if (crtl->calls_eh_return)
10869 restore_regs_via_mov = true;
10870 /* SEH requires the use of pops to identify the epilogue. */
10871 else if (TARGET_SEH)
10872 restore_regs_via_mov = false;
10873 /* If we're only restoring one register and sp is not valid, then
10874 use a move instruction to restore the register, since it's
10875 less work than reloading sp and popping the register. */
10876 else if (!m->fs.sp_valid && frame.nregs <= 1)
10877 restore_regs_via_mov = true;
10878 else if (TARGET_EPILOGUE_USING_MOVE
10879 && cfun->machine->use_fast_prologue_epilogue
10880 && (frame.nregs > 1
10881 || m->fs.sp_offset != frame.reg_save_offset))
10882 restore_regs_via_mov = true;
10883 else if (frame_pointer_needed
10884 && !frame.nregs
10885 && m->fs.sp_offset != frame.reg_save_offset)
10886 restore_regs_via_mov = true;
10887 else if (frame_pointer_needed
10888 && TARGET_USE_LEAVE
10889 && cfun->machine->use_fast_prologue_epilogue
10890 && frame.nregs == 1)
10891 restore_regs_via_mov = true;
10892 else
10893 restore_regs_via_mov = false;
10895 if (restore_regs_via_mov || frame.nsseregs)
10897 /* Ensure that the entire register save area is addressable via
10898 the stack pointer, if we will restore via sp. */
10899 if (TARGET_64BIT
10900 && m->fs.sp_offset > 0x7fffffff
10901 && !(m->fs.fp_valid || m->fs.drap_valid)
10902 && (frame.nsseregs + frame.nregs) != 0)
10904 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10905 GEN_INT (m->fs.sp_offset
10906 - frame.sse_reg_save_offset),
10907 style,
10908 m->fs.cfa_reg == stack_pointer_rtx);
10912 /* If there are any SSE registers to restore, then we have to do it
10913 via moves, since there's obviously no pop for SSE regs. */
10914 if (frame.nsseregs)
10915 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10916 style == 2);
10918 if (restore_regs_via_mov)
10920 rtx t;
10922 if (frame.nregs)
10923 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10925 /* eh_return epilogues need %ecx added to the stack pointer. */
10926 if (style == 2)
10928 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10930 /* Stack align doesn't work with eh_return. */
10931 gcc_assert (!stack_realign_drap);
10932 /* Neither do regparm nested functions. */
10933 gcc_assert (!ix86_static_chain_on_stack);
10935 if (frame_pointer_needed)
10937 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10938 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10939 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10941 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10942 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10944 /* Note that we use SA as a temporary CFA, as the return
10945 address is at the proper place relative to it. We
10946 pretend this happens at the FP restore insn because
10947 prior to this insn the FP would be stored at the wrong
10948 offset relative to SA, and after this insn we have no
10949 other reasonable register to use for the CFA. We don't
10950 bother resetting the CFA to the SP for the duration of
10951 the return insn. */
10952 add_reg_note (insn, REG_CFA_DEF_CFA,
10953 plus_constant (Pmode, sa, UNITS_PER_WORD));
10954 ix86_add_queued_cfa_restore_notes (insn);
10955 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10956 RTX_FRAME_RELATED_P (insn) = 1;
10958 m->fs.cfa_reg = sa;
10959 m->fs.cfa_offset = UNITS_PER_WORD;
10960 m->fs.fp_valid = false;
10962 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10963 const0_rtx, style, false);
10965 else
10967 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10968 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10969 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10970 ix86_add_queued_cfa_restore_notes (insn);
10972 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10973 if (m->fs.cfa_offset != UNITS_PER_WORD)
10975 m->fs.cfa_offset = UNITS_PER_WORD;
10976 add_reg_note (insn, REG_CFA_DEF_CFA,
10977 plus_constant (Pmode, stack_pointer_rtx,
10978 UNITS_PER_WORD));
10979 RTX_FRAME_RELATED_P (insn) = 1;
10982 m->fs.sp_offset = UNITS_PER_WORD;
10983 m->fs.sp_valid = true;
10986 else
10988 /* SEH requires that the function end with (1) a stack adjustment
10989 if necessary, (2) a sequence of pops, and (3) a return or
10990 jump instruction. Prevent insns from the function body from
10991 being scheduled into this sequence. */
10992 if (TARGET_SEH)
10994 /* Prevent a catch region from being adjacent to the standard
10995 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10996 several other flags that would be interesting to test are
10997 set up yet. */
10998 if (flag_non_call_exceptions)
10999 emit_insn (gen_nops (const1_rtx));
11000 else
11001 emit_insn (gen_blockage ());
11004 /* The first step is to deallocate the stack frame so that we can
11005 pop the registers. Also do this on SEH targets for very large
11006 frames, as the emitted instructions aren't allowed by the ABI in
11007 epilogues. */
11008 if (!m->fs.sp_valid
11009 || (TARGET_SEH
11010 && (m->fs.sp_offset - frame.reg_save_offset
11011 >= SEH_MAX_FRAME_SIZE)))
11013 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11014 GEN_INT (m->fs.fp_offset
11015 - frame.reg_save_offset),
11016 style, false);
11018 else if (m->fs.sp_offset != frame.reg_save_offset)
11020 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11021 GEN_INT (m->fs.sp_offset
11022 - frame.reg_save_offset),
11023 style,
11024 m->fs.cfa_reg == stack_pointer_rtx);
11027 ix86_emit_restore_regs_using_pop ();
11030 /* If we used a frame pointer and haven't already got rid of it,
11031 then do so now. */
11032 if (m->fs.fp_valid)
11034 /* If the stack pointer is valid and pointing at the frame
11035 pointer store address, then we only need a pop. */
11036 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11037 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11038 /* Using leave results in shorter dependency chains on CPUs that are
11039 able to grok it fast. */
11040 else if (TARGET_USE_LEAVE
11041 || optimize_function_for_size_p (cfun)
11042 || !cfun->machine->use_fast_prologue_epilogue)
11043 ix86_emit_leave ();
11044 else
11046 pro_epilogue_adjust_stack (stack_pointer_rtx,
11047 hard_frame_pointer_rtx,
11048 const0_rtx, style, !using_drap);
11049 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11053 if (using_drap)
11055 int param_ptr_offset = UNITS_PER_WORD;
11056 rtx insn;
11058 gcc_assert (stack_realign_drap);
11060 if (ix86_static_chain_on_stack)
11061 param_ptr_offset += UNITS_PER_WORD;
11062 if (!call_used_regs[REGNO (crtl->drap_reg)])
11063 param_ptr_offset += UNITS_PER_WORD;
11065 insn = emit_insn (gen_rtx_SET
11066 (VOIDmode, stack_pointer_rtx,
11067 gen_rtx_PLUS (Pmode,
11068 crtl->drap_reg,
11069 GEN_INT (-param_ptr_offset))));
11070 m->fs.cfa_reg = stack_pointer_rtx;
11071 m->fs.cfa_offset = param_ptr_offset;
11072 m->fs.sp_offset = param_ptr_offset;
11073 m->fs.realigned = false;
11075 add_reg_note (insn, REG_CFA_DEF_CFA,
11076 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11077 GEN_INT (param_ptr_offset)));
11078 RTX_FRAME_RELATED_P (insn) = 1;
11080 if (!call_used_regs[REGNO (crtl->drap_reg)])
11081 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11084 /* At this point the stack pointer must be valid, and we must have
11085 restored all of the registers. We may not have deallocated the
11086 entire stack frame. We've delayed this until now because it may
11087 be possible to merge the local stack deallocation with the
11088 deallocation forced by ix86_static_chain_on_stack. */
11089 gcc_assert (m->fs.sp_valid);
11090 gcc_assert (!m->fs.fp_valid);
11091 gcc_assert (!m->fs.realigned);
11092 if (m->fs.sp_offset != UNITS_PER_WORD)
11094 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11095 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11096 style, true);
11098 else
11099 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11101 /* Sibcall epilogues don't want a return instruction. */
11102 if (style == 0)
11104 m->fs = frame_state_save;
11105 return;
11108 if (crtl->args.pops_args && crtl->args.size)
11110 rtx popc = GEN_INT (crtl->args.pops_args);
11112 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11113 address, do an explicit add, and jump indirectly to the caller. */
11115 if (crtl->args.pops_args >= 65536)
11117 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11118 rtx insn;
11120 /* There is no "pascal" calling convention in any 64bit ABI. */
11121 gcc_assert (!TARGET_64BIT);
11123 insn = emit_insn (gen_pop (ecx));
11124 m->fs.cfa_offset -= UNITS_PER_WORD;
11125 m->fs.sp_offset -= UNITS_PER_WORD;
11127 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11128 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11129 add_reg_note (insn, REG_CFA_REGISTER,
11130 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11131 RTX_FRAME_RELATED_P (insn) = 1;
11133 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11134 popc, -1, true);
11135 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
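/* In effect, the sequence emitted above for pops_args >= 64K is

       pop  %ecx                  ; return address into %ecx
       add  $pops_args, %esp      ; pop the arguments
       jmp  *%ecx                 ; return to the caller

   since ret $N can only encode a 16-bit immediate.  */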
11137 else
11138 emit_jump_insn (gen_simple_return_pop_internal (popc));
11140 else
11141 emit_jump_insn (gen_simple_return_internal ());
11143 /* Restore the state back to the state from the prologue,
11144 so that it's correct for the next epilogue. */
11145 m->fs = frame_state_save;
11148 /* Reset from the function's potential modifications. */
11150 static void
11151 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11152 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11154 if (pic_offset_table_rtx)
11155 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11156 #if TARGET_MACHO
11157 /* Mach-O doesn't support labels at the end of objects, so if
11158 it looks like we might want one, insert a NOP. */
11160 rtx insn = get_last_insn ();
11161 rtx deleted_debug_label = NULL_RTX;
11162 while (insn
11163 && NOTE_P (insn)
11164 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11166 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11167 notes only, instead set their CODE_LABEL_NUMBER to -1,
11168 otherwise there would be code generation differences
11169 in between -g and -g0. */
11170 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11171 deleted_debug_label = insn;
11172 insn = PREV_INSN (insn);
11174 if (insn
11175 && (LABEL_P (insn)
11176 || (NOTE_P (insn)
11177 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11178 fputs ("\tnop\n", file);
11179 else if (deleted_debug_label)
11180 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11181 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11182 CODE_LABEL_NUMBER (insn) = -1;
11184 #endif
11188 /* Return a scratch register to use in the split stack prologue. The
11189 split stack prologue is used for -fsplit-stack. It consists of the first
11190 instructions in the function, even before the regular prologue.
11191 The scratch register can be any caller-saved register which is not
11192 used for parameters or for the static chain. */
11194 static unsigned int
11195 split_stack_prologue_scratch_regno (void)
11197 if (TARGET_64BIT)
11198 return R11_REG;
11199 else
11201 bool is_fastcall, is_thiscall;
11202 int regparm;
11204 is_fastcall = (lookup_attribute ("fastcall",
11205 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11206 != NULL);
11207 is_thiscall = (lookup_attribute ("thiscall",
11208 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11209 != NULL);
11210 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11212 if (is_fastcall)
11214 if (DECL_STATIC_CHAIN (cfun->decl))
11216 sorry ("-fsplit-stack does not support fastcall with "
11217 "nested function");
11218 return INVALID_REGNUM;
11220 return AX_REG;
11222 else if (is_thiscall)
11224 if (!DECL_STATIC_CHAIN (cfun->decl))
11225 return DX_REG;
11226 return AX_REG;
11228 else if (regparm < 3)
11230 if (!DECL_STATIC_CHAIN (cfun->decl))
11231 return CX_REG;
11232 else
11234 if (regparm >= 2)
11236 sorry ("-fsplit-stack does not support 2 register "
11237 " parameters for a nested function");
11238 return INVALID_REGNUM;
11240 return DX_REG;
11243 else
11245 /* FIXME: We could make this work by pushing a register
11246 around the addition and comparison. */
11247 sorry ("-fsplit-stack does not support 3 register parameters");
11248 return INVALID_REGNUM;
11253 /* A SYMBOL_REF for the function which allocates new stack space for
11254 -fsplit-stack. */
11256 static GTY(()) rtx split_stack_fn;
11258 /* A SYMBOL_REF for the more stack function when using the large
11259 model. */
11261 static GTY(()) rtx split_stack_fn_large;
11263 /* Handle -fsplit-stack. These are the first instructions in the
11264 function, even before the regular prologue. */
11266 void
11267 ix86_expand_split_stack_prologue (void)
11269 struct ix86_frame frame;
11270 HOST_WIDE_INT allocate;
11271 unsigned HOST_WIDE_INT args_size;
11272 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11273 rtx scratch_reg = NULL_RTX;
11274 rtx varargs_label = NULL_RTX;
11275 rtx fn;
11277 gcc_assert (flag_split_stack && reload_completed);
11279 ix86_finalize_stack_realign_flags ();
11280 ix86_compute_frame_layout (&frame);
11281 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11283 /* This is the label we will branch to if we have enough stack
11284 space. We expect the basic block reordering pass to reverse this
11285 branch if optimizing, so that we branch in the unlikely case. */
11286 label = gen_label_rtx ();
11288 /* We need to compare the stack pointer minus the frame size with
11289 the stack boundary in the TCB. The stack boundary always gives
11290 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11291 can compare directly. Otherwise we need to do an addition. */
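/* For illustration only: on GNU/Linux targets the comparison below
   typically assembles to something like

       cmp  %fs:<stack-boundary-slot>, %rsp
       jae  .Lenough_stack

   with the __morestack call on the fall-through path.  The exact TCB
   slot is target-specific; it is what the UNSPEC_STACK_CHECK address
   expands to.  */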
11293 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11294 UNSPEC_STACK_CHECK);
11295 limit = gen_rtx_CONST (Pmode, limit);
11296 limit = gen_rtx_MEM (Pmode, limit);
11297 if (allocate < SPLIT_STACK_AVAILABLE)
11298 current = stack_pointer_rtx;
11299 else
11301 unsigned int scratch_regno;
11302 rtx offset;
11304 /* We need a scratch register to hold the stack pointer minus
11305 the required frame size. Since this is the very start of the
11306 function, the scratch register can be any caller-saved
11307 register which is not used for parameters. */
11308 offset = GEN_INT (- allocate);
11309 scratch_regno = split_stack_prologue_scratch_regno ();
11310 if (scratch_regno == INVALID_REGNUM)
11311 return;
11312 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11313 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11315 /* We don't use ix86_gen_add3 in this case because it will
11316 want to split to lea, but when not optimizing the insn
11317 will not be split after this point. */
11318 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11319 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11320 offset)));
11322 else
11324 emit_move_insn (scratch_reg, offset);
11325 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11326 stack_pointer_rtx));
11328 current = scratch_reg;
11331 ix86_expand_branch (GEU, current, limit, label);
11332 jump_insn = get_last_insn ();
11333 JUMP_LABEL (jump_insn) = label;
11335 /* Mark the jump as very likely to be taken. */
11336 add_reg_note (jump_insn, REG_BR_PROB,
11337 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11339 if (split_stack_fn == NULL_RTX)
11340 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11341 fn = split_stack_fn;
11343 /* Get more stack space. We pass in the desired stack space and the
11344 size of the arguments to copy to the new stack. In 32-bit mode
11345 we push the parameters; __morestack will return on a new stack
11346 anyhow. In 64-bit mode we pass the parameters in r10 and
11347 r11. */
11348 allocate_rtx = GEN_INT (allocate);
11349 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11350 call_fusage = NULL_RTX;
11351 if (TARGET_64BIT)
11353 rtx reg10, reg11;
11355 reg10 = gen_rtx_REG (Pmode, R10_REG);
11356 reg11 = gen_rtx_REG (Pmode, R11_REG);
11358 /* If this function uses a static chain, it will be in %r10.
11359 Preserve it across the call to __morestack. */
11360 if (DECL_STATIC_CHAIN (cfun->decl))
11362 rtx rax;
11364 rax = gen_rtx_REG (word_mode, AX_REG);
11365 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11366 use_reg (&call_fusage, rax);
11369 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11371 HOST_WIDE_INT argval;
11373 gcc_assert (Pmode == DImode);
11374 /* When using the large model we need to load the address
11375 into a register, and we've run out of registers. So we
11376 switch to a different calling convention, and we call a
11377 different function: __morestack_large. We pass the
11378 argument size in the upper 32 bits of r10 and pass the
11379 frame size in the lower 32 bits. */
11380 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11381 gcc_assert ((args_size & 0xffffffff) == args_size);
11383 if (split_stack_fn_large == NULL_RTX)
11384 split_stack_fn_large =
11385 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11387 if (ix86_cmodel == CM_LARGE_PIC)
11389 rtx label, x;
11391 label = gen_label_rtx ();
11392 emit_label (label);
11393 LABEL_PRESERVE_P (label) = 1;
11394 emit_insn (gen_set_rip_rex64 (reg10, label));
11395 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11396 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11397 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11398 UNSPEC_GOT);
11399 x = gen_rtx_CONST (Pmode, x);
11400 emit_move_insn (reg11, x);
11401 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11402 x = gen_const_mem (Pmode, x);
11403 emit_move_insn (reg11, x);
11405 else
11406 emit_move_insn (reg11, split_stack_fn_large);
11408 fn = reg11;
11410 argval = ((args_size << 16) << 16) + allocate;
11411 emit_move_insn (reg10, GEN_INT (argval));
11413 else
11415 emit_move_insn (reg10, allocate_rtx);
11416 emit_move_insn (reg11, GEN_INT (args_size));
11417 use_reg (&call_fusage, reg11);
11420 use_reg (&call_fusage, reg10);
11422 else
11424 emit_insn (gen_push (GEN_INT (args_size)));
11425 emit_insn (gen_push (allocate_rtx));
11427 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11428 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11429 NULL_RTX, false);
11430 add_function_usage_to (call_insn, call_fusage);
11432 /* In order to make call/return prediction work right, we now need
11433 to execute a return instruction. See
11434 libgcc/config/i386/morestack.S for the details on how this works.
11436 For flow purposes gcc must not see this as a return
11437 instruction--we need control flow to continue at the subsequent
11438 label. Therefore, we use an unspec. */
11439 gcc_assert (crtl->args.pops_args < 65536);
11440 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11442 /* If we are in 64-bit mode and this function uses a static chain,
11443 we saved %r10 in %rax before calling __morestack. */
11444 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11445 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11446 gen_rtx_REG (word_mode, AX_REG));
11448 /* If this function calls va_start, we need to store a pointer to
11449 the arguments on the old stack, because they may not have been
11450 all copied to the new stack. At this point the old stack can be
11451 found at the frame pointer value used by __morestack, because
11452 __morestack has set that up before calling back to us. Here we
11453 store that pointer in a scratch register, and in
11454 ix86_expand_prologue we store the scratch register in a stack
11455 slot. */
11456 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11458 unsigned int scratch_regno;
11459 rtx frame_reg;
11460 int words;
11462 scratch_regno = split_stack_prologue_scratch_regno ();
11463 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11464 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11466 /* 64-bit:
11467 fp -> old fp value
11468 return address within this function
11469 return address of caller of this function
11470 stack arguments
11471 So we add three words to get to the stack arguments.
11473 32-bit:
11474 fp -> old fp value
11475 return address within this function
11476 first argument to __morestack
11477 second argument to __morestack
11478 return address of caller of this function
11479 stack arguments
11480 So we add five words to get to the stack arguments. */
11482 words = TARGET_64BIT ? 3 : 5;
11483 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11484 gen_rtx_PLUS (Pmode, frame_reg,
11485 GEN_INT (words * UNITS_PER_WORD))));
11487 varargs_label = gen_label_rtx ();
11488 emit_jump_insn (gen_jump (varargs_label));
11489 JUMP_LABEL (get_last_insn ()) = varargs_label;
11491 emit_barrier ();
11494 emit_label (label);
11495 LABEL_NUSES (label) = 1;
11497 /* If this function calls va_start, we now have to set the scratch
11498 register for the case where we do not call __morestack. In this
11499 case we need to set it based on the stack pointer. */
11500 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11502 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11503 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11504 GEN_INT (UNITS_PER_WORD))));
11506 emit_label (varargs_label);
11507 LABEL_NUSES (varargs_label) = 1;
11511 /* We may have to tell the dataflow pass that the split stack prologue
11512 is initializing a scratch register. */
11514 static void
11515 ix86_live_on_entry (bitmap regs)
11517 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11519 gcc_assert (flag_split_stack);
11520 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11524 /* Determine if OP is a suitable SUBREG RTX for an address. */
11526 static bool
11527 ix86_address_subreg_operand (rtx op)
11529 enum machine_mode mode;
11531 if (!REG_P (op))
11532 return false;
11534 mode = GET_MODE (op);
11536 if (GET_MODE_CLASS (mode) != MODE_INT)
11537 return false;
11539 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11540 failures when the register is one word out of a two word structure. */
11541 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11542 return false;
11544 /* Allow only SUBREGs of non-eliminable hard registers. */
11545 return register_no_elim_operand (op, mode);
11548 /* Extract the parts of an RTL expression that is a valid memory address
11549 for an instruction. Return 0 if the structure of the address is
11550 grossly off. Return -1 if the address contains ASHIFT, so it is not
11551 strictly valid, but is still used for computing the length of a lea instruction. */
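/* As an illustration, an x86 effective address has the general form

       base + index*scale + disp       (plus an optional segment)

   so an operand such as 8(%rbx,%rcx,4) decomposes into base = %rbx,
   index = %rcx, scale = 4 and disp = 8 in the structure filled in
   below.  */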
11554 ix86_decompose_address (rtx addr, struct ix86_address *out)
11556 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11557 rtx base_reg, index_reg;
11558 HOST_WIDE_INT scale = 1;
11559 rtx scale_rtx = NULL_RTX;
11560 rtx tmp;
11561 int retval = 1;
11562 enum ix86_address_seg seg = SEG_DEFAULT;
11564 /* Allow zero-extended SImode addresses;
11565 they will be emitted with the addr32 prefix. */
11566 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11568 if (GET_CODE (addr) == ZERO_EXTEND
11569 && GET_MODE (XEXP (addr, 0)) == SImode)
11571 addr = XEXP (addr, 0);
11572 if (CONST_INT_P (addr))
11573 return 0;
11575 else if (GET_CODE (addr) == AND
11576 && const_32bit_mask (XEXP (addr, 1), DImode))
11578 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11579 if (addr == NULL_RTX)
11580 return 0;
11582 if (CONST_INT_P (addr))
11583 return 0;
11587 /* Allow SImode subregs of DImode addresses;
11588 they will be emitted with the addr32 prefix. */
11589 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11591 if (GET_CODE (addr) == SUBREG
11592 && GET_MODE (SUBREG_REG (addr)) == DImode)
11594 addr = SUBREG_REG (addr);
11595 if (CONST_INT_P (addr))
11596 return 0;
11600 if (REG_P (addr))
11601 base = addr;
11602 else if (GET_CODE (addr) == SUBREG)
11604 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11605 base = addr;
11606 else
11607 return 0;
11609 else if (GET_CODE (addr) == PLUS)
11611 rtx addends[4], op;
11612 int n = 0, i;
11614 op = addr;
11617 if (n >= 4)
11618 return 0;
11619 addends[n++] = XEXP (op, 1);
11620 op = XEXP (op, 0);
11622 while (GET_CODE (op) == PLUS);
11623 if (n >= 4)
11624 return 0;
11625 addends[n] = op;
11627 for (i = n; i >= 0; --i)
11629 op = addends[i];
11630 switch (GET_CODE (op))
11632 case MULT:
11633 if (index)
11634 return 0;
11635 index = XEXP (op, 0);
11636 scale_rtx = XEXP (op, 1);
11637 break;
11639 case ASHIFT:
11640 if (index)
11641 return 0;
11642 index = XEXP (op, 0);
11643 tmp = XEXP (op, 1);
11644 if (!CONST_INT_P (tmp))
11645 return 0;
11646 scale = INTVAL (tmp);
11647 if ((unsigned HOST_WIDE_INT) scale > 3)
11648 return 0;
11649 scale = 1 << scale;
11650 break;
11652 case ZERO_EXTEND:
11653 op = XEXP (op, 0);
11654 if (GET_CODE (op) != UNSPEC)
11655 return 0;
11656 /* FALLTHRU */
11658 case UNSPEC:
11659 if (XINT (op, 1) == UNSPEC_TP
11660 && TARGET_TLS_DIRECT_SEG_REFS
11661 && seg == SEG_DEFAULT)
11662 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11663 else
11664 return 0;
11665 break;
11667 case SUBREG:
11668 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11669 return 0;
11670 /* FALLTHRU */
11672 case REG:
11673 if (!base)
11674 base = op;
11675 else if (!index)
11676 index = op;
11677 else
11678 return 0;
11679 break;
11681 case CONST:
11682 case CONST_INT:
11683 case SYMBOL_REF:
11684 case LABEL_REF:
11685 if (disp)
11686 return 0;
11687 disp = op;
11688 break;
11690 default:
11691 return 0;
11695 else if (GET_CODE (addr) == MULT)
11697 index = XEXP (addr, 0); /* index*scale */
11698 scale_rtx = XEXP (addr, 1);
11700 else if (GET_CODE (addr) == ASHIFT)
11702 /* We're called for lea too, which implements ashift on occasion. */
11703 index = XEXP (addr, 0);
11704 tmp = XEXP (addr, 1);
11705 if (!CONST_INT_P (tmp))
11706 return 0;
11707 scale = INTVAL (tmp);
11708 if ((unsigned HOST_WIDE_INT) scale > 3)
11709 return 0;
11710 scale = 1 << scale;
11711 retval = -1;
11713 else if (CONST_INT_P (addr))
11715 if (!x86_64_immediate_operand (addr, VOIDmode))
11716 return 0;
11718 /* Constant addresses are sign-extended to 64 bits, so we have to
11719 reject addresses from 0x80000000 to 0xffffffff in x32 mode. */
11720 if (TARGET_X32
11721 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11722 return 0;
11724 disp = addr;
11726 else
11727 disp = addr; /* displacement */
11729 if (index)
11731 if (REG_P (index))
11733 else if (GET_CODE (index) == SUBREG
11734 && ix86_address_subreg_operand (SUBREG_REG (index)))
11736 else
11737 return 0;
11740 /* Address override works only on the (%reg) part of %fs:(%reg). */
11741 if (seg != SEG_DEFAULT
11742 && ((base && GET_MODE (base) != word_mode)
11743 || (index && GET_MODE (index) != word_mode)))
11744 return 0;
11746 /* Extract the integral value of scale. */
11747 if (scale_rtx)
11749 if (!CONST_INT_P (scale_rtx))
11750 return 0;
11751 scale = INTVAL (scale_rtx);
11754 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11755 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11757 /* Avoid useless 0 displacement. */
11758 if (disp == const0_rtx && (base || index))
11759 disp = NULL_RTX;
11761 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
11762 if (base_reg && index_reg && scale == 1
11763 && (index_reg == arg_pointer_rtx
11764 || index_reg == frame_pointer_rtx
11765 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11767 rtx tmp;
11768 tmp = base, base = index, index = tmp;
11769 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11772 /* Special case: %ebp cannot be encoded as a base without a displacement.
11773 Similarly %r13. */
11774 if (!disp
11775 && base_reg
11776 && (base_reg == hard_frame_pointer_rtx
11777 || base_reg == frame_pointer_rtx
11778 || base_reg == arg_pointer_rtx
11779 || (REG_P (base_reg)
11780 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11781 || REGNO (base_reg) == R13_REG))))
11782 disp = const0_rtx;
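/* The reason for the special case above: in the ModR/M byte, mod == 00
   with a base of EBP (or R13 under REX) does not mean "EBP with no
   displacement" but "disp32 only", so EBP/R13 as a base must always be
   encoded with at least an 8-bit zero displacement.  */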
11784 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11785 Avoid this by transforming to [%esi+0].
11786 Reload calls address legitimization without cfun defined, so we need
11787 to test cfun for being non-NULL. */
11788 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11789 && base_reg && !index_reg && !disp
11790 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11791 disp = const0_rtx;
11793 /* Special case: encode reg+reg instead of reg*2. */
11794 if (!base && index && scale == 2)
11795 base = index, base_reg = index_reg, scale = 1;
11797 /* Special case: scaling cannot be encoded without base or displacement. */
11798 if (!base && !disp && index && scale != 1)
11799 disp = const0_rtx;
11801 out->base = base;
11802 out->index = index;
11803 out->disp = disp;
11804 out->scale = scale;
11805 out->seg = seg;
11807 return retval;
11810 /* Return the cost of the memory address X.
11811 For i386, it is better to use a complex address than let gcc copy
11812 the address into a reg and make a new pseudo. But not if the address
11813 requires two regs - that would mean more pseudos with longer
11814 lifetimes. */
11815 static int
11816 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11817 addr_space_t as ATTRIBUTE_UNUSED,
11818 bool speed ATTRIBUTE_UNUSED)
11820 struct ix86_address parts;
11821 int cost = 1;
11822 int ok = ix86_decompose_address (x, &parts);
11824 gcc_assert (ok);
11826 if (parts.base && GET_CODE (parts.base) == SUBREG)
11827 parts.base = SUBREG_REG (parts.base);
11828 if (parts.index && GET_CODE (parts.index) == SUBREG)
11829 parts.index = SUBREG_REG (parts.index);
11831 /* Attempt to minimize number of registers in the address. */
11832 if ((parts.base
11833 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11834 || (parts.index
11835 && (!REG_P (parts.index)
11836 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11837 cost++;
11839 if (parts.base
11840 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11841 && parts.index
11842 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11843 && parts.base != parts.index)
11844 cost++;
11846 /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11847 since its predecode logic can't detect the length of such instructions
11848 and decoding degenerates to vector decoding. Increase the cost of such
11849 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11850 to split such addresses or even refuse them altogether.
11852 The following addressing modes are affected:
11853 [base+scale*index]
11854 [scale*index+disp]
11855 [base+index]
11857 The first and last case may be avoidable by explicitly coding the zero in
11858 the memory address, but I don't have an AMD K6 machine handy to check this
11859 theory. */
11861 if (TARGET_K6
11862 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11863 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11864 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11865 cost += 10;
11867 return cost;
11870 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11871 this is used to form addresses to local data when -fPIC is in
11872 use. */
11874 static bool
11875 darwin_local_data_pic (rtx disp)
11877 return (GET_CODE (disp) == UNSPEC
11878 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11881 /* Determine if a given RTX is a valid constant. We already know this
11882 satisfies CONSTANT_P. */
11884 static bool
11885 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11887 switch (GET_CODE (x))
11889 case CONST:
11890 x = XEXP (x, 0);
11892 if (GET_CODE (x) == PLUS)
11894 if (!CONST_INT_P (XEXP (x, 1)))
11895 return false;
11896 x = XEXP (x, 0);
11899 if (TARGET_MACHO && darwin_local_data_pic (x))
11900 return true;
11902 /* Only some unspecs are valid as "constants". */
11903 if (GET_CODE (x) == UNSPEC)
11904 switch (XINT (x, 1))
11906 case UNSPEC_GOT:
11907 case UNSPEC_GOTOFF:
11908 case UNSPEC_PLTOFF:
11909 return TARGET_64BIT;
11910 case UNSPEC_TPOFF:
11911 case UNSPEC_NTPOFF:
11912 x = XVECEXP (x, 0, 0);
11913 return (GET_CODE (x) == SYMBOL_REF
11914 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11915 case UNSPEC_DTPOFF:
11916 x = XVECEXP (x, 0, 0);
11917 return (GET_CODE (x) == SYMBOL_REF
11918 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11919 default:
11920 return false;
11923 /* We must have drilled down to a symbol. */
11924 if (GET_CODE (x) == LABEL_REF)
11925 return true;
11926 if (GET_CODE (x) != SYMBOL_REF)
11927 return false;
11928 /* FALLTHRU */
11930 case SYMBOL_REF:
11931 /* TLS symbols are never valid. */
11932 if (SYMBOL_REF_TLS_MODEL (x))
11933 return false;
11935 /* DLLIMPORT symbols are never valid. */
11936 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11937 && SYMBOL_REF_DLLIMPORT_P (x))
11938 return false;
11940 #if TARGET_MACHO
11941 /* mdynamic-no-pic */
11942 if (MACHO_DYNAMIC_NO_PIC_P)
11943 return machopic_symbol_defined_p (x);
11944 #endif
11945 break;
11947 case CONST_DOUBLE:
11948 if (GET_MODE (x) == TImode
11949 && x != CONST0_RTX (TImode)
11950 && !TARGET_64BIT)
11951 return false;
11952 break;
11954 case CONST_VECTOR:
11955 if (!standard_sse_constant_p (x))
11956 return false;
11958 default:
11959 break;
11962 /* Otherwise we handle everything else in the move patterns. */
11963 return true;
11966 /* Determine if it's legal to put X into the constant pool. This
11967 is not possible for the address of thread-local symbols, which
11968 is checked above. */
11970 static bool
11971 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11973 /* We can always put integral constants and vectors in memory. */
11974 switch (GET_CODE (x))
11976 case CONST_INT:
11977 case CONST_DOUBLE:
11978 case CONST_VECTOR:
11979 return false;
11981 default:
11982 break;
11984 return !ix86_legitimate_constant_p (mode, x);
11988 /* Nonzero if the constant value X is a legitimate general operand
11989 when generating PIC code. It is given that flag_pic is on and
11990 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11992 bool
11993 legitimate_pic_operand_p (rtx x)
11995 rtx inner;
11997 switch (GET_CODE (x))
11999 case CONST:
12000 inner = XEXP (x, 0);
12001 if (GET_CODE (inner) == PLUS
12002 && CONST_INT_P (XEXP (inner, 1)))
12003 inner = XEXP (inner, 0);
12005 /* Only some unspecs are valid as "constants". */
12006 if (GET_CODE (inner) == UNSPEC)
12007 switch (XINT (inner, 1))
12009 case UNSPEC_GOT:
12010 case UNSPEC_GOTOFF:
12011 case UNSPEC_PLTOFF:
12012 return TARGET_64BIT;
12013 case UNSPEC_TPOFF:
12014 x = XVECEXP (inner, 0, 0);
12015 return (GET_CODE (x) == SYMBOL_REF
12016 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12017 case UNSPEC_MACHOPIC_OFFSET:
12018 return legitimate_pic_address_disp_p (x);
12019 default:
12020 return false;
12022 /* FALLTHRU */
12024 case SYMBOL_REF:
12025 case LABEL_REF:
12026 return legitimate_pic_address_disp_p (x);
12028 default:
12029 return true;
12033 /* Determine if a given CONST RTX is a valid memory displacement
12034 in PIC mode. */
12036 bool
12037 legitimate_pic_address_disp_p (rtx disp)
12039 bool saw_plus;
12041 /* In 64bit mode we can allow direct addresses of symbols and labels
12042 when they are not dynamic symbols. */
12043 if (TARGET_64BIT)
12045 rtx op0 = disp, op1;
12047 switch (GET_CODE (disp))
12049 case LABEL_REF:
12050 return true;
12052 case CONST:
12053 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12054 break;
12055 op0 = XEXP (XEXP (disp, 0), 0);
12056 op1 = XEXP (XEXP (disp, 0), 1);
12057 if (!CONST_INT_P (op1)
12058 || INTVAL (op1) >= 16*1024*1024
12059 || INTVAL (op1) < -16*1024*1024)
12060 break;
12061 if (GET_CODE (op0) == LABEL_REF)
12062 return true;
12063 if (GET_CODE (op0) == CONST
12064 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12065 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12066 return true;
12067 if (GET_CODE (op0) == UNSPEC
12068 && XINT (op0, 1) == UNSPEC_PCREL)
12069 return true;
12070 if (GET_CODE (op0) != SYMBOL_REF)
12071 break;
12072 /* FALLTHRU */
12074 case SYMBOL_REF:
12075 /* TLS references should always be enclosed in UNSPEC. */
12076 if (SYMBOL_REF_TLS_MODEL (op0))
12077 return false;
12078 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12079 && ix86_cmodel != CM_LARGE_PIC)
12080 return true;
12081 break;
12083 default:
12084 break;
12087 if (GET_CODE (disp) != CONST)
12088 return false;
12089 disp = XEXP (disp, 0);
12091 if (TARGET_64BIT)
12093 /* It is not safe to allow PLUS expressions; this limits the allowed
12094 distance of GOT tables. We should not need these anyway. */
12095 if (GET_CODE (disp) != UNSPEC
12096 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12097 && XINT (disp, 1) != UNSPEC_GOTOFF
12098 && XINT (disp, 1) != UNSPEC_PCREL
12099 && XINT (disp, 1) != UNSPEC_PLTOFF))
12100 return false;
12102 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12103 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12104 return false;
12105 return true;
12108 saw_plus = false;
12109 if (GET_CODE (disp) == PLUS)
12111 if (!CONST_INT_P (XEXP (disp, 1)))
12112 return false;
12113 disp = XEXP (disp, 0);
12114 saw_plus = true;
12117 if (TARGET_MACHO && darwin_local_data_pic (disp))
12118 return true;
12120 if (GET_CODE (disp) != UNSPEC)
12121 return false;
12123 switch (XINT (disp, 1))
12125 case UNSPEC_GOT:
12126 if (saw_plus)
12127 return false;
12128 /* We need to check for both symbols and labels because VxWorks loads
12129 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12130 details. */
12131 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12132 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12133 case UNSPEC_GOTOFF:
12134 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12135 The ABI also specifies a 32bit relocation, but we don't produce it in
12136 the small PIC model at all. */
12137 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12138 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12139 && !TARGET_64BIT)
12140 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12141 return false;
12142 case UNSPEC_GOTTPOFF:
12143 case UNSPEC_GOTNTPOFF:
12144 case UNSPEC_INDNTPOFF:
12145 if (saw_plus)
12146 return false;
12147 disp = XVECEXP (disp, 0, 0);
12148 return (GET_CODE (disp) == SYMBOL_REF
12149 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12150 case UNSPEC_NTPOFF:
12151 disp = XVECEXP (disp, 0, 0);
12152 return (GET_CODE (disp) == SYMBOL_REF
12153 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12154 case UNSPEC_DTPOFF:
12155 disp = XVECEXP (disp, 0, 0);
12156 return (GET_CODE (disp) == SYMBOL_REF
12157 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12160 return false;
12163 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12164 replace the input X, or the original X if no replacement is called for.
12165 The output parameter *WIN is 1 if the calling macro should goto WIN,
12166 0 if it should not. */
12168 bool
12169 ix86_legitimize_reload_address (rtx x,
12170 enum machine_mode mode ATTRIBUTE_UNUSED,
12171 int opnum, int type,
12172 int ind_levels ATTRIBUTE_UNUSED)
12174 /* Reload can generate:
12176 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12177 (reg:DI 97))
12178 (reg:DI 2 cx))
12180 This RTX is rejected by ix86_legitimate_address_p due to
12181 non-strictness of base register 97. Following this rejection,
12182 reload pushes all three components into separate registers,
12183 creating an invalid memory address RTX.
12185 The following code reloads only the invalid part of the
12186 memory address RTX. */
12188 if (GET_CODE (x) == PLUS
12189 && REG_P (XEXP (x, 1))
12190 && GET_CODE (XEXP (x, 0)) == PLUS
12191 && REG_P (XEXP (XEXP (x, 0), 1)))
12193 rtx base, index;
12194 bool something_reloaded = false;
12196 base = XEXP (XEXP (x, 0), 1);
12197 if (!REG_OK_FOR_BASE_STRICT_P (base))
12199 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12200 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12201 opnum, (enum reload_type) type);
12202 something_reloaded = true;
12205 index = XEXP (x, 1);
12206 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12208 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12209 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12210 opnum, (enum reload_type) type);
12211 something_reloaded = true;
12214 gcc_assert (something_reloaded);
12215 return true;
12218 return false;
12221 /* Recognizes RTL expressions that are valid memory addresses for an
12222 instruction. The MODE argument is the machine mode for the MEM
12223 expression that wants to use this address.
12225 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12226 convert common non-canonical forms to canonical form so that they will
12227 be recognized. */
12229 static bool
12230 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12231 rtx addr, bool strict)
12233 struct ix86_address parts;
12234 rtx base, index, disp;
12235 HOST_WIDE_INT scale;
12237 if (ix86_decompose_address (addr, &parts) <= 0)
12238 /* Decomposition failed. */
12239 return false;
12241 base = parts.base;
12242 index = parts.index;
12243 disp = parts.disp;
12244 scale = parts.scale;
12246 /* Validate base register. */
12247 if (base)
12249 rtx reg;
12251 if (REG_P (base))
12252 reg = base;
12253 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12254 reg = SUBREG_REG (base);
12255 else
12256 /* Base is not a register. */
12257 return false;
12259 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12260 return false;
12262 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12263 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12264 /* Base is not valid. */
12265 return false;
12268 /* Validate index register. */
12269 if (index)
12271 rtx reg;
12273 if (REG_P (index))
12274 reg = index;
12275 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12276 reg = SUBREG_REG (index);
12277 else
12278 /* Index is not a register. */
12279 return false;
12281 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12282 return false;
12284 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12285 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12286 /* Index is not valid. */
12287 return false;
12290 /* Index and base should have the same mode. */
12291 if (base && index
12292 && GET_MODE (base) != GET_MODE (index))
12293 return false;
12295 /* Validate scale factor. */
12296 if (scale != 1)
12298 if (!index)
12299 /* Scale without index. */
12300 return false;
12302 if (scale != 2 && scale != 4 && scale != 8)
12303 /* Scale is not a valid multiplier. */
12304 return false;
12307 /* Validate displacement. */
12308 if (disp)
12310 if (GET_CODE (disp) == CONST
12311 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12312 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12313 switch (XINT (XEXP (disp, 0), 1))
12315 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12316 used. The ABI also specifies 32bit relocations, but we don't produce
12317 them at all and use IP-relative addressing instead. */
12318 case UNSPEC_GOT:
12319 case UNSPEC_GOTOFF:
12320 gcc_assert (flag_pic);
12321 if (!TARGET_64BIT)
12322 goto is_legitimate_pic;
12324 /* 64bit address unspec. */
12325 return false;
12327 case UNSPEC_GOTPCREL:
12328 case UNSPEC_PCREL:
12329 gcc_assert (flag_pic);
12330 goto is_legitimate_pic;
12332 case UNSPEC_GOTTPOFF:
12333 case UNSPEC_GOTNTPOFF:
12334 case UNSPEC_INDNTPOFF:
12335 case UNSPEC_NTPOFF:
12336 case UNSPEC_DTPOFF:
12337 break;
12339 case UNSPEC_STACK_CHECK:
12340 gcc_assert (flag_split_stack);
12341 break;
12343 default:
12344 /* Invalid address unspec. */
12345 return false;
12348 else if (SYMBOLIC_CONST (disp)
12349 && (flag_pic
12350 || (TARGET_MACHO
12351 #if TARGET_MACHO
12352 && MACHOPIC_INDIRECT
12353 && !machopic_operand_p (disp)
12354 #endif
12358 is_legitimate_pic:
12359 if (TARGET_64BIT && (index || base))
12361 /* foo@dtpoff(%rX) is ok. */
12362 if (GET_CODE (disp) != CONST
12363 || GET_CODE (XEXP (disp, 0)) != PLUS
12364 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12365 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12366 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12367 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12368 /* Non-constant pic memory reference. */
12369 return false;
12371 else if ((!TARGET_MACHO || flag_pic)
12372 && ! legitimate_pic_address_disp_p (disp))
12373 /* Displacement is an invalid pic construct. */
12374 return false;
12375 #if TARGET_MACHO
12376 else if (MACHO_DYNAMIC_NO_PIC_P
12377 && !ix86_legitimate_constant_p (Pmode, disp))
12378 /* Displacement must be referenced via a non_lazy_pointer. */
12379 return false;
12380 #endif
12382 /* This code used to verify that a symbolic pic displacement
12383 includes the pic_offset_table_rtx register.
12385 While this is a good idea, unfortunately these constructs may
12386 be created by the "adds using lea" optimization for incorrect
12387 code like:
12389 int a;
12390 int foo(int i)
12392 return *(&a+i);
12395 This code is nonsensical, but results in addressing
12396 the GOT table with pic_offset_table_rtx as the base. We can't
12397 easily refuse it, since it gets matched by the
12398 "addsi3" pattern, which later gets split to lea when the
12399 output register differs from the input. While this
12400 could be handled by a separate addsi pattern for this case
12401 that never results in lea, disabling this test seems to be
12402 the easier and correct fix for the crash. */
12404 else if (GET_CODE (disp) != LABEL_REF
12405 && !CONST_INT_P (disp)
12406 && (GET_CODE (disp) != CONST
12407 || !ix86_legitimate_constant_p (Pmode, disp))
12408 && (GET_CODE (disp) != SYMBOL_REF
12409 || !ix86_legitimate_constant_p (Pmode, disp)))
12410 /* Displacement is not constant. */
12411 return false;
12412 else if (TARGET_64BIT
12413 && !x86_64_immediate_operand (disp, VOIDmode))
12414 /* Displacement is out of range. */
12415 return false;
12418 /* Everything looks valid. */
12419 return true;
12422 /* Determine if a given RTX is a valid constant address. */
12424 bool
12425 constant_address_p (rtx x)
12427 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12430 /* Return a unique alias set for the GOT. */
12432 static alias_set_type
12433 ix86_GOT_alias_set (void)
12435 static alias_set_type set = -1;
12436 if (set == -1)
12437 set = new_alias_set ();
12438 return set;
12441 /* Return a legitimate reference for ORIG (an address) using the
12442 register REG. If REG is 0, a new pseudo is generated.
12444 There are two types of references that must be handled:
12446 1. Global data references must load the address from the GOT, via
12447 the PIC reg. An insn is emitted to do this load, and the reg is
12448 returned.
12450 2. Static data references, constant pool addresses, and code labels
12451 compute the address as an offset from the GOT, whose base is in
12452 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12453 differentiate them from global data objects. The returned
12454 address is the PIC reg + an unspec constant.
12456 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12457 reg also appears in the address. */
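/* For the 32-bit ELF case the two kinds of reference correspond to the
   usual assembly idioms, roughly:

       movl  foo@GOT(%ebx), %reg       # global data: load address from GOT
       leal  bar@GOTOFF(%ebx), %reg    # local/static data: PIC-base relative

   with %ebx standing in for pic_offset_table_rtx.  */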
12459 static rtx
12460 legitimize_pic_address (rtx orig, rtx reg)
12462 rtx addr = orig;
12463 rtx new_rtx = orig;
12465 #if TARGET_MACHO
12466 if (TARGET_MACHO && !TARGET_64BIT)
12468 if (reg == 0)
12469 reg = gen_reg_rtx (Pmode);
12470 /* Use the generic Mach-O PIC machinery. */
12471 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12473 #endif
12475 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12476 new_rtx = addr;
12477 else if (TARGET_64BIT
12478 && ix86_cmodel != CM_SMALL_PIC
12479 && gotoff_operand (addr, Pmode))
12481 rtx tmpreg;
12482 /* This symbol may be referenced via a displacement from the PIC
12483 base address (@GOTOFF). */
12485 if (reload_in_progress)
12486 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12487 if (GET_CODE (addr) == CONST)
12488 addr = XEXP (addr, 0);
12489 if (GET_CODE (addr) == PLUS)
12491 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12492 UNSPEC_GOTOFF);
12493 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12495 else
12496 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12497 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12498 if (!reg)
12499 tmpreg = gen_reg_rtx (Pmode);
12500 else
12501 tmpreg = reg;
12502 emit_move_insn (tmpreg, new_rtx);
12504 if (reg != 0)
12506 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12507 tmpreg, 1, OPTAB_DIRECT);
12508 new_rtx = reg;
12510 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12512 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12514 /* This symbol may be referenced via a displacement from the PIC
12515 base address (@GOTOFF). */
12517 if (reload_in_progress)
12518 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12519 if (GET_CODE (addr) == CONST)
12520 addr = XEXP (addr, 0);
12521 if (GET_CODE (addr) == PLUS)
12523 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12524 UNSPEC_GOTOFF);
12525 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12527 else
12528 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12529 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12530 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12532 if (reg != 0)
12534 emit_move_insn (reg, new_rtx);
12535 new_rtx = reg;
12538 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12539 /* We can't use @GOTOFF for text labels on VxWorks;
12540 see gotoff_operand. */
12541 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12543 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12545 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12546 return legitimize_dllimport_symbol (addr, true);
12547 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12548 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12549 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12551 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12552 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12556 /* For x64 PE-COFF there is no GOT table, so we use the address
12557 directly. */
12558 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12560 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12561 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12563 if (reg == 0)
12564 reg = gen_reg_rtx (Pmode);
12565 emit_move_insn (reg, new_rtx);
12566 new_rtx = reg;
12568 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12570 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12571 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12572 new_rtx = gen_const_mem (Pmode, new_rtx);
12573 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12575 if (reg == 0)
12576 reg = gen_reg_rtx (Pmode);
12577 /* Use gen_movsi directly; otherwise the address would be loaded
12578 into a register for CSE. We don't want to CSE these addresses;
12579 instead we CSE the addresses loaded from the GOT table, so skip this. */
12580 emit_insn (gen_movsi (reg, new_rtx));
12581 new_rtx = reg;
12583 else
12585 /* This symbol must be referenced via a load from the
12586 Global Offset Table (@GOT). */
12588 if (reload_in_progress)
12589 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12590 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12591 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12592 if (TARGET_64BIT)
12593 new_rtx = force_reg (Pmode, new_rtx);
12594 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12595 new_rtx = gen_const_mem (Pmode, new_rtx);
12596 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12598 if (reg == 0)
12599 reg = gen_reg_rtx (Pmode);
12600 emit_move_insn (reg, new_rtx);
12601 new_rtx = reg;
12604 else
12606 if (CONST_INT_P (addr)
12607 && !x86_64_immediate_operand (addr, VOIDmode))
12609 if (reg)
12611 emit_move_insn (reg, addr);
12612 new_rtx = reg;
12614 else
12615 new_rtx = force_reg (Pmode, addr);
12617 else if (GET_CODE (addr) == CONST)
12619 addr = XEXP (addr, 0);
12621 /* We must match the forms we generated earlier. Assume the only
12622 UNSPECs that can get here are ours; not that we could do
12623 anything with them anyway.... */
12624 if (GET_CODE (addr) == UNSPEC
12625 || (GET_CODE (addr) == PLUS
12626 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12627 return orig;
12628 gcc_assert (GET_CODE (addr) == PLUS);
12630 if (GET_CODE (addr) == PLUS)
12632 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12634 /* Check first to see if this is a constant offset from a @GOTOFF
12635 symbol reference. */
12636 if (gotoff_operand (op0, Pmode)
12637 && CONST_INT_P (op1))
12639 if (!TARGET_64BIT)
12641 if (reload_in_progress)
12642 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12643 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12644 UNSPEC_GOTOFF);
12645 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12646 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12647 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12649 if (reg != 0)
12651 emit_move_insn (reg, new_rtx);
12652 new_rtx = reg;
12655 else
12657 if (INTVAL (op1) < -16*1024*1024
12658 || INTVAL (op1) >= 16*1024*1024)
12660 if (!x86_64_immediate_operand (op1, Pmode))
12661 op1 = force_reg (Pmode, op1);
12662 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12666 else
12668 rtx base = legitimize_pic_address (op0, reg);
12669 enum machine_mode mode = GET_MODE (base);
12670 new_rtx
12671 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12673 if (CONST_INT_P (new_rtx))
12675 if (INTVAL (new_rtx) < -16*1024*1024
12676 || INTVAL (new_rtx) >= 16*1024*1024)
12678 if (!x86_64_immediate_operand (new_rtx, mode))
12679 new_rtx = force_reg (mode, new_rtx);
12680 new_rtx
12681 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12683 else
12684 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12686 else
12688 if (GET_CODE (new_rtx) == PLUS
12689 && CONSTANT_P (XEXP (new_rtx, 1)))
12691 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12692 new_rtx = XEXP (new_rtx, 1);
12694 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12699 return new_rtx;
12702 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12704 static rtx
12705 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12707 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12709 if (GET_MODE (tp) != tp_mode)
12711 gcc_assert (GET_MODE (tp) == SImode);
12712 gcc_assert (tp_mode == DImode);
12714 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12717 if (to_reg)
12718 tp = copy_to_mode_reg (tp_mode, tp);
12720 return tp;
12723 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12725 static GTY(()) rtx ix86_tls_symbol;
12727 static rtx
12728 ix86_tls_get_addr (void)
12730 if (!ix86_tls_symbol)
12732 const char *sym
12733 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12734 ? "___tls_get_addr" : "__tls_get_addr");
12736 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12739 return ix86_tls_symbol;
12742 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12744 static GTY(()) rtx ix86_tls_module_base_symbol;
12747 ix86_tls_module_base (void)
12749 if (!ix86_tls_module_base_symbol)
12751 ix86_tls_module_base_symbol
12752 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12754 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12755 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12758 return ix86_tls_module_base_symbol;
12761 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12762 false if we expect this to be used for a memory address and true if
12763 we expect to load the address into a register. */
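/* A rough map of the code sequences produced below, one per TLS model
   (a sketch; the exact form depends on TARGET_64BIT, TARGET_GNU2_TLS,
   -fpic and the assembler dialect):
     global-dynamic: the address is obtained from a __tls_get_addr call
       (or computed against the thread pointer for the GNU2 ABI);
     local-dynamic: a single __tls_get_addr call yields the module base,
       and each variable adds its @dtpoff offset to it;
     initial-exec: the @gottpoff offset is loaded from the GOT and added
       to (or subtracted from) the thread pointer in %fs/%gs;
     local-exec: the @tpoff/@ntpoff offset is a link-time constant
       combined directly with the thread pointer.  */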
12765 static rtx
12766 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12768 rtx dest, base, off;
12769 rtx pic = NULL_RTX, tp = NULL_RTX;
12770 enum machine_mode tp_mode = Pmode;
12771 int type;
12773 switch (model)
12775 case TLS_MODEL_GLOBAL_DYNAMIC:
12776 dest = gen_reg_rtx (Pmode);
12778 if (!TARGET_64BIT)
12780 if (flag_pic)
12781 pic = pic_offset_table_rtx;
12782 else
12784 pic = gen_reg_rtx (Pmode);
12785 emit_insn (gen_set_got (pic));
12789 if (TARGET_GNU2_TLS)
12791 if (TARGET_64BIT)
12792 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12793 else
12794 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12796 tp = get_thread_pointer (Pmode, true);
12797 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12799 if (GET_MODE (x) != Pmode)
12800 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12802 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12804 else
12806 rtx caddr = ix86_tls_get_addr ();
12808 if (TARGET_64BIT)
12810 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12811 rtx insns;
12813 start_sequence ();
12814 emit_call_insn
12815 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12816 insns = get_insns ();
12817 end_sequence ();
12819 if (GET_MODE (x) != Pmode)
12820 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12822 RTL_CONST_CALL_P (insns) = 1;
12823 emit_libcall_block (insns, dest, rax, x);
12825 else
12826 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12828 break;
12830 case TLS_MODEL_LOCAL_DYNAMIC:
12831 base = gen_reg_rtx (Pmode);
12833 if (!TARGET_64BIT)
12835 if (flag_pic)
12836 pic = pic_offset_table_rtx;
12837 else
12839 pic = gen_reg_rtx (Pmode);
12840 emit_insn (gen_set_got (pic));
12844 if (TARGET_GNU2_TLS)
12846 rtx tmp = ix86_tls_module_base ();
12848 if (TARGET_64BIT)
12849 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12850 else
12851 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12853 tp = get_thread_pointer (Pmode, true);
12854 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12855 gen_rtx_MINUS (Pmode, tmp, tp));
12857 else
12859 rtx caddr = ix86_tls_get_addr ();
12861 if (TARGET_64BIT)
12863 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12864 rtx insns, eqv;
12866 start_sequence ();
12867 emit_call_insn
12868 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12869 insns = get_insns ();
12870 end_sequence ();
12872 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12873 share the LD_BASE result with other LD model accesses. */
12874 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12875 UNSPEC_TLS_LD_BASE);
12877 RTL_CONST_CALL_P (insns) = 1;
12878 emit_libcall_block (insns, base, rax, eqv);
12880 else
12881 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12884 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12885 off = gen_rtx_CONST (Pmode, off);
12887 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12889 if (TARGET_GNU2_TLS)
12891 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12893 if (GET_MODE (x) != Pmode)
12894 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12896 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12898 break;
12900 case TLS_MODEL_INITIAL_EXEC:
12901 if (TARGET_64BIT)
12903 if (TARGET_SUN_TLS && !TARGET_X32)
12905 /* The Sun linker took the AMD64 TLS spec literally
12906 and can only handle %rax as the destination of the
12907 initial-exec code sequence. */
12909 dest = gen_reg_rtx (DImode);
12910 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12911 return dest;
12914 /* Generate DImode references to avoid %fs:(%reg32)
12915 problems and the linker IE->LE relaxation bug. */
12916 tp_mode = DImode;
12917 pic = NULL;
12918 type = UNSPEC_GOTNTPOFF;
12920 else if (flag_pic)
12922 if (reload_in_progress)
12923 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12924 pic = pic_offset_table_rtx;
12925 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12927 else if (!TARGET_ANY_GNU_TLS)
12929 pic = gen_reg_rtx (Pmode);
12930 emit_insn (gen_set_got (pic));
12931 type = UNSPEC_GOTTPOFF;
12933 else
12935 pic = NULL;
12936 type = UNSPEC_INDNTPOFF;
12939 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12940 off = gen_rtx_CONST (tp_mode, off);
12941 if (pic)
12942 off = gen_rtx_PLUS (tp_mode, pic, off);
12943 off = gen_const_mem (tp_mode, off);
12944 set_mem_alias_set (off, ix86_GOT_alias_set ());
12946 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12948 base = get_thread_pointer (tp_mode,
12949 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12950 off = force_reg (tp_mode, off);
12951 return gen_rtx_PLUS (tp_mode, base, off);
12953 else
12955 base = get_thread_pointer (Pmode, true);
12956 dest = gen_reg_rtx (Pmode);
12957 emit_insn (ix86_gen_sub3 (dest, base, off));
12959 break;
12961 case TLS_MODEL_LOCAL_EXEC:
12962 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12963 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12964 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12965 off = gen_rtx_CONST (Pmode, off);
12967 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12969 base = get_thread_pointer (Pmode,
12970 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12971 return gen_rtx_PLUS (Pmode, base, off);
12973 else
12975 base = get_thread_pointer (Pmode, true);
12976 dest = gen_reg_rtx (Pmode);
12977 emit_insn (ix86_gen_sub3 (dest, base, off));
12979 break;
12981 default:
12982 gcc_unreachable ();
12985 return dest;
12988 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12989 to symbol DECL. */
12991 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12992 htab_t dllimport_map;
12994 static tree
12995 get_dllimport_decl (tree decl)
12997 struct tree_map *h, in;
12998 void **loc;
12999 const char *name;
13000 const char *prefix;
13001 size_t namelen, prefixlen;
13002 char *imp_name;
13003 tree to;
13004 rtx rtl;
13006 if (!dllimport_map)
13007 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13009 in.hash = htab_hash_pointer (decl);
13010 in.base.from = decl;
13011 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13012 h = (struct tree_map *) *loc;
13013 if (h)
13014 return h->to;
13016 *loc = h = ggc_alloc_tree_map ();
13017 h->hash = in.hash;
13018 h->base.from = decl;
13019 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13020 VAR_DECL, NULL, ptr_type_node);
13021 DECL_ARTIFICIAL (to) = 1;
13022 DECL_IGNORED_P (to) = 1;
13023 DECL_EXTERNAL (to) = 1;
13024 TREE_READONLY (to) = 1;
13026 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13027 name = targetm.strip_name_encoding (name);
13028 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13029 ? "*__imp_" : "*__imp__";
13030 namelen = strlen (name);
13031 prefixlen = strlen (prefix);
13032 imp_name = (char *) alloca (namelen + prefixlen + 1);
13033 memcpy (imp_name, prefix, prefixlen);
13034 memcpy (imp_name + prefixlen, name, namelen + 1);
13036 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13037 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13038 SET_SYMBOL_REF_DECL (rtl, to);
13039 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13041 rtl = gen_const_mem (Pmode, rtl);
13042 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13044 SET_DECL_RTL (to, rtl);
13045 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13047 return to;
13050 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13051 true if we require the result be a register. */
13053 static rtx
13054 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13056 tree imp_decl;
13057 rtx x;
13059 gcc_assert (SYMBOL_REF_DECL (symbol));
13060 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13062 x = DECL_RTL (imp_decl);
13063 if (want_reg)
13064 x = force_reg (Pmode, x);
13065 return x;
13068 /* Try machine-dependent ways of modifying an illegitimate address
13069 to be legitimate. If we find one, return the new, valid address.
13070 This macro is used in only one place: `memory_address' in explow.c.
13072 OLDX is the address as it was before break_out_memory_refs was called.
13073 In some cases it is useful to look at this to decide what needs to be done.
13075 It is always safe for this macro to do nothing. It exists to recognize
13076 opportunities to optimize the output.
13078 For the 80386, we handle X+REG by loading X into a register R and
13079 using R+REG. R will go in a general reg and indexing will be used.
13080 However, if REG is a broken-out memory address or multiplication,
13081 nothing needs to be done because REG can certainly go in a general reg.
13083 When -fpic is used, special handling is needed for symbolic references.
13084 See comments by legitimize_pic_address in i386.c for details. */
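/* For example, the canonicalizations below rewrite an address such as
   (plus (ashift (reg A) (const_int 2)) (reg B)) into
   (plus (mult (reg A) (const_int 4)) (reg B)), which matches the scaled
   index form (%B,%A,4) accepted by ix86_legitimate_address_p (a sketch;
   the register names are purely illustrative).  */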
13086 static rtx
13087 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13088 enum machine_mode mode)
13090 int changed = 0;
13091 unsigned log;
13093 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13094 if (log)
13095 return legitimize_tls_address (x, (enum tls_model) log, false);
13096 if (GET_CODE (x) == CONST
13097 && GET_CODE (XEXP (x, 0)) == PLUS
13098 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13099 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13101 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13102 (enum tls_model) log, false);
13103 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13106 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13108 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13109 return legitimize_dllimport_symbol (x, true);
13110 if (GET_CODE (x) == CONST
13111 && GET_CODE (XEXP (x, 0)) == PLUS
13112 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13113 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13115 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13116 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13120 if (flag_pic && SYMBOLIC_CONST (x))
13121 return legitimize_pic_address (x, 0);
13123 #if TARGET_MACHO
13124 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13125 return machopic_indirect_data_reference (x, 0);
13126 #endif
13128 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13129 if (GET_CODE (x) == ASHIFT
13130 && CONST_INT_P (XEXP (x, 1))
13131 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13133 changed = 1;
13134 log = INTVAL (XEXP (x, 1));
13135 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13136 GEN_INT (1 << log));
13139 if (GET_CODE (x) == PLUS)
13141 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13143 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13144 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13145 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13147 changed = 1;
13148 log = INTVAL (XEXP (XEXP (x, 0), 1));
13149 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13150 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13151 GEN_INT (1 << log));
13154 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13155 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13156 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13158 changed = 1;
13159 log = INTVAL (XEXP (XEXP (x, 1), 1));
13160 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13161 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13162 GEN_INT (1 << log));
13165 /* Put multiply first if it isn't already. */
13166 if (GET_CODE (XEXP (x, 1)) == MULT)
13168 rtx tmp = XEXP (x, 0);
13169 XEXP (x, 0) = XEXP (x, 1);
13170 XEXP (x, 1) = tmp;
13171 changed = 1;
13174 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13175 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13176 created by virtual register instantiation, register elimination, and
13177 similar optimizations. */
13178 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13180 changed = 1;
13181 x = gen_rtx_PLUS (Pmode,
13182 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13183 XEXP (XEXP (x, 1), 0)),
13184 XEXP (XEXP (x, 1), 1));
13187 /* Canonicalize
13188 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13189 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13190 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13191 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13192 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13193 && CONSTANT_P (XEXP (x, 1)))
13195 rtx constant;
13196 rtx other = NULL_RTX;
13198 if (CONST_INT_P (XEXP (x, 1)))
13200 constant = XEXP (x, 1);
13201 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13203 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13205 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13206 other = XEXP (x, 1);
13208 else
13209 constant = 0;
13211 if (constant)
13213 changed = 1;
13214 x = gen_rtx_PLUS (Pmode,
13215 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13216 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13217 plus_constant (Pmode, other,
13218 INTVAL (constant)));
13222 if (changed && ix86_legitimate_address_p (mode, x, false))
13223 return x;
13225 if (GET_CODE (XEXP (x, 0)) == MULT)
13227 changed = 1;
13228 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13231 if (GET_CODE (XEXP (x, 1)) == MULT)
13233 changed = 1;
13234 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13237 if (changed
13238 && REG_P (XEXP (x, 1))
13239 && REG_P (XEXP (x, 0)))
13240 return x;
13242 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13244 changed = 1;
13245 x = legitimize_pic_address (x, 0);
13248 if (changed && ix86_legitimate_address_p (mode, x, false))
13249 return x;
13251 if (REG_P (XEXP (x, 0)))
13253 rtx temp = gen_reg_rtx (Pmode);
13254 rtx val = force_operand (XEXP (x, 1), temp);
13255 if (val != temp)
13257 val = convert_to_mode (Pmode, val, 1);
13258 emit_move_insn (temp, val);
13261 XEXP (x, 1) = temp;
13262 return x;
13265 else if (REG_P (XEXP (x, 1)))
13267 rtx temp = gen_reg_rtx (Pmode);
13268 rtx val = force_operand (XEXP (x, 0), temp);
13269 if (val != temp)
13271 val = convert_to_mode (Pmode, val, 1);
13272 emit_move_insn (temp, val);
13275 XEXP (x, 0) = temp;
13276 return x;
13280 return x;
13283 /* Print an integer constant expression in assembler syntax. Addition
13284 and subtraction are the only arithmetic that may appear in these
13285 expressions. FILE is the stdio stream to write to, X is the rtx, and
13286 CODE is the operand print code from the output string. */
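/* For instance, (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) is
   printed as "foo@GOTOFF", and a 64-bit GOT reference is printed as
   "foo@GOTPCREL(%rip)" in AT&T syntax (a sketch; see the UNSPEC cases
   below for the full set of relocation suffixes).  */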
13288 static void
13289 output_pic_addr_const (FILE *file, rtx x, int code)
13291 char buf[256];
13293 switch (GET_CODE (x))
13295 case PC:
13296 gcc_assert (flag_pic);
13297 putc ('.', file);
13298 break;
13300 case SYMBOL_REF:
13301 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13302 output_addr_const (file, x);
13303 else
13305 const char *name = XSTR (x, 0);
13307 /* Mark the decl as referenced so that cgraph will
13308 output the function. */
13309 if (SYMBOL_REF_DECL (x))
13310 mark_decl_referenced (SYMBOL_REF_DECL (x));
13312 #if TARGET_MACHO
13313 if (MACHOPIC_INDIRECT
13314 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13315 name = machopic_indirection_name (x, /*stub_p=*/true);
13316 #endif
13317 assemble_name (file, name);
13319 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13320 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13321 fputs ("@PLT", file);
13322 break;
13324 case LABEL_REF:
13325 x = XEXP (x, 0);
13326 /* FALLTHRU */
13327 case CODE_LABEL:
13328 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13329 assemble_name (asm_out_file, buf);
13330 break;
13332 case CONST_INT:
13333 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13334 break;
13336 case CONST:
13337 /* This used to output parentheses around the expression,
13338 but that does not work on the 386 (either ATT or BSD assembler). */
13339 output_pic_addr_const (file, XEXP (x, 0), code);
13340 break;
13342 case CONST_DOUBLE:
13343 if (GET_MODE (x) == VOIDmode)
13345 /* We can use %d if the number is <32 bits and positive. */
13346 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13347 fprintf (file, "0x%lx%08lx",
13348 (unsigned long) CONST_DOUBLE_HIGH (x),
13349 (unsigned long) CONST_DOUBLE_LOW (x));
13350 else
13351 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13353 else
13354 /* We can't handle floating point constants;
13355 TARGET_PRINT_OPERAND must handle them. */
13356 output_operand_lossage ("floating constant misused");
13357 break;
13359 case PLUS:
13360 /* Some assemblers need integer constants to appear first. */
13361 if (CONST_INT_P (XEXP (x, 0)))
13363 output_pic_addr_const (file, XEXP (x, 0), code);
13364 putc ('+', file);
13365 output_pic_addr_const (file, XEXP (x, 1), code);
13367 else
13369 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13370 output_pic_addr_const (file, XEXP (x, 1), code);
13371 putc ('+', file);
13372 output_pic_addr_const (file, XEXP (x, 0), code);
13374 break;
13376 case MINUS:
13377 if (!TARGET_MACHO)
13378 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13379 output_pic_addr_const (file, XEXP (x, 0), code);
13380 putc ('-', file);
13381 output_pic_addr_const (file, XEXP (x, 1), code);
13382 if (!TARGET_MACHO)
13383 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13384 break;
13386 case UNSPEC:
13387 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13389 bool f = i386_asm_output_addr_const_extra (file, x);
13390 gcc_assert (f);
13391 break;
13394 gcc_assert (XVECLEN (x, 0) == 1);
13395 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13396 switch (XINT (x, 1))
13398 case UNSPEC_GOT:
13399 fputs ("@GOT", file);
13400 break;
13401 case UNSPEC_GOTOFF:
13402 fputs ("@GOTOFF", file);
13403 break;
13404 case UNSPEC_PLTOFF:
13405 fputs ("@PLTOFF", file);
13406 break;
13407 case UNSPEC_PCREL:
13408 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13409 "(%rip)" : "[rip]", file);
13410 break;
13411 case UNSPEC_GOTPCREL:
13412 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13413 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13414 break;
13415 case UNSPEC_GOTTPOFF:
13416 /* FIXME: This might be @TPOFF in Sun ld too. */
13417 fputs ("@gottpoff", file);
13418 break;
13419 case UNSPEC_TPOFF:
13420 fputs ("@tpoff", file);
13421 break;
13422 case UNSPEC_NTPOFF:
13423 if (TARGET_64BIT)
13424 fputs ("@tpoff", file);
13425 else
13426 fputs ("@ntpoff", file);
13427 break;
13428 case UNSPEC_DTPOFF:
13429 fputs ("@dtpoff", file);
13430 break;
13431 case UNSPEC_GOTNTPOFF:
13432 if (TARGET_64BIT)
13433 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13434 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13435 else
13436 fputs ("@gotntpoff", file);
13437 break;
13438 case UNSPEC_INDNTPOFF:
13439 fputs ("@indntpoff", file);
13440 break;
13441 #if TARGET_MACHO
13442 case UNSPEC_MACHOPIC_OFFSET:
13443 putc ('-', file);
13444 machopic_output_function_base_name (file);
13445 break;
13446 #endif
13447 default:
13448 output_operand_lossage ("invalid UNSPEC as operand");
13449 break;
13451 break;
13453 default:
13454 output_operand_lossage ("invalid expression as operand");
13458 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13459 We need to emit DTP-relative relocations. */
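/* A sketch of the output for a 4-byte request (AT&T/ELF ASM_LONG
   spelling assumed):
       .long   foo@dtpoff
   The 8-byte case appends ", 0" for the upper half.  */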
13461 static void ATTRIBUTE_UNUSED
13462 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13464 fputs (ASM_LONG, file);
13465 output_addr_const (file, x);
13466 fputs ("@dtpoff", file);
13467 switch (size)
13469 case 4:
13470 break;
13471 case 8:
13472 fputs (", 0", file);
13473 break;
13474 default:
13475 gcc_unreachable ();
13479 /* Return true if X is a representation of the PIC register. This copes
13480 with calls from ix86_find_base_term, where the register might have
13481 been replaced by a cselib value. */
13483 static bool
13484 ix86_pic_register_p (rtx x)
13486 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13487 return (pic_offset_table_rtx
13488 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13489 else
13490 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13493 /* Helper function for ix86_delegitimize_address.
13494 Attempt to delegitimize TLS local-exec accesses. */
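/* E.g. a %fs-/%gs-relative address whose displacement wraps the symbol
   in UNSPEC_NTPOFF is rewritten back to the plain SYMBOL_REF combined
   with any base, index and constant offset (a sketch; other TLS forms
   are left untouched).  */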
13496 static rtx
13497 ix86_delegitimize_tls_address (rtx orig_x)
13499 rtx x = orig_x, unspec;
13500 struct ix86_address addr;
13502 if (!TARGET_TLS_DIRECT_SEG_REFS)
13503 return orig_x;
13504 if (MEM_P (x))
13505 x = XEXP (x, 0);
13506 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13507 return orig_x;
13508 if (ix86_decompose_address (x, &addr) == 0
13509 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13510 || addr.disp == NULL_RTX
13511 || GET_CODE (addr.disp) != CONST)
13512 return orig_x;
13513 unspec = XEXP (addr.disp, 0);
13514 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13515 unspec = XEXP (unspec, 0);
13516 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13517 return orig_x;
13518 x = XVECEXP (unspec, 0, 0);
13519 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13520 if (unspec != XEXP (addr.disp, 0))
13521 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13522 if (addr.index)
13524 rtx idx = addr.index;
13525 if (addr.scale != 1)
13526 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13527 x = gen_rtx_PLUS (Pmode, idx, x);
13529 if (addr.base)
13530 x = gen_rtx_PLUS (Pmode, addr.base, x);
13531 if (MEM_P (orig_x))
13532 x = replace_equiv_address_nv (orig_x, x);
13533 return x;
13536 /* In the name of slightly smaller debug output, and to cater to
13537 general assembler lossage, recognize PIC+GOTOFF and turn it back
13538 into a direct symbol reference.
13540 On Darwin, this is necessary to avoid a crash, because Darwin
13541 has a different PIC label for each routine but the DWARF debugging
13542 information is not associated with any particular routine, so it's
13543 necessary to remove references to the PIC label from RTL stored by
13544 the DWARF output code. */
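/* For example, (plus (reg ebx) (const (unspec [(symbol_ref "foo")]
   UNSPEC_GOTOFF))) collapses back to (symbol_ref "foo"), and a load
   from the GOT likewise yields the referenced symbol (a sketch;
   constant and register addends are re-applied around the result).  */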
13546 static rtx
13547 ix86_delegitimize_address (rtx x)
13549 rtx orig_x = delegitimize_mem_from_attrs (x);
13550 /* addend is NULL or some rtx if x is something+GOTOFF where
13551 something doesn't include the PIC register. */
13552 rtx addend = NULL_RTX;
13553 /* reg_addend is NULL or a multiple of some register. */
13554 rtx reg_addend = NULL_RTX;
13555 /* const_addend is NULL or a const_int. */
13556 rtx const_addend = NULL_RTX;
13557 /* This is the result, or NULL. */
13558 rtx result = NULL_RTX;
13560 x = orig_x;
13562 if (MEM_P (x))
13563 x = XEXP (x, 0);
13565 if (TARGET_64BIT)
13567 if (GET_CODE (x) == CONST
13568 && GET_CODE (XEXP (x, 0)) == PLUS
13569 && GET_MODE (XEXP (x, 0)) == Pmode
13570 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13571 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13572 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13574 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13575 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13576 if (MEM_P (orig_x))
13577 x = replace_equiv_address_nv (orig_x, x);
13578 return x;
13580 if (GET_CODE (x) != CONST
13581 || GET_CODE (XEXP (x, 0)) != UNSPEC
13582 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13583 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13584 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13585 return ix86_delegitimize_tls_address (orig_x);
13586 x = XVECEXP (XEXP (x, 0), 0, 0);
13587 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13589 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13590 GET_MODE (x), 0);
13591 if (x == NULL_RTX)
13592 return orig_x;
13594 return x;
13597 if (GET_CODE (x) != PLUS
13598 || GET_CODE (XEXP (x, 1)) != CONST)
13599 return ix86_delegitimize_tls_address (orig_x);
13601 if (ix86_pic_register_p (XEXP (x, 0)))
13602 /* %ebx + GOT/GOTOFF */
13604 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13606 /* %ebx + %reg * scale + GOT/GOTOFF */
13607 reg_addend = XEXP (x, 0);
13608 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13609 reg_addend = XEXP (reg_addend, 1);
13610 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13611 reg_addend = XEXP (reg_addend, 0);
13612 else
13614 reg_addend = NULL_RTX;
13615 addend = XEXP (x, 0);
13618 else
13619 addend = XEXP (x, 0);
13621 x = XEXP (XEXP (x, 1), 0);
13622 if (GET_CODE (x) == PLUS
13623 && CONST_INT_P (XEXP (x, 1)))
13625 const_addend = XEXP (x, 1);
13626 x = XEXP (x, 0);
13629 if (GET_CODE (x) == UNSPEC
13630 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13631 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13632 result = XVECEXP (x, 0, 0);
13634 if (TARGET_MACHO && darwin_local_data_pic (x)
13635 && !MEM_P (orig_x))
13636 result = XVECEXP (x, 0, 0);
13638 if (! result)
13639 return ix86_delegitimize_tls_address (orig_x);
13641 if (const_addend)
13642 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13643 if (reg_addend)
13644 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13645 if (addend)
13647 /* If the rest of original X doesn't involve the PIC register, add
13648 addend and subtract pic_offset_table_rtx. This can happen e.g.
13649 for code like:
13650 leal (%ebx, %ecx, 4), %ecx
13652 movl foo@GOTOFF(%ecx), %edx
13653 in which case we return (%ecx - %ebx) + foo. */
13654 if (pic_offset_table_rtx)
13655 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13656 pic_offset_table_rtx),
13657 result);
13658 else
13659 return orig_x;
13661 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13663 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13664 if (result == NULL_RTX)
13665 return orig_x;
13667 return result;
13670 /* If X is a machine specific address (i.e. a symbol or label being
13671 referenced as a displacement from the GOT implemented using an
13672 UNSPEC), then return the base term. Otherwise return X. */
13675 ix86_find_base_term (rtx x)
13677 rtx term;
13679 if (TARGET_64BIT)
13681 if (GET_CODE (x) != CONST)
13682 return x;
13683 term = XEXP (x, 0);
13684 if (GET_CODE (term) == PLUS
13685 && (CONST_INT_P (XEXP (term, 1))
13686 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13687 term = XEXP (term, 0);
13688 if (GET_CODE (term) != UNSPEC
13689 || (XINT (term, 1) != UNSPEC_GOTPCREL
13690 && XINT (term, 1) != UNSPEC_PCREL))
13691 return x;
13693 return XVECEXP (term, 0, 0);
13696 return ix86_delegitimize_address (x);
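/* Print to FILE the one- or two-letter condition suffix ("e", "ne", "b",
   "ge", ...) for comparison CODE under flags mode MODE.  If REVERSE is
   true the condition is reversed first; FP selects the spellings that
   work around fcmov assembler lossage ("nbe" instead of "a", etc.).  */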
13699 static void
13700 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13701 bool fp, FILE *file)
13703 const char *suffix;
13705 if (mode == CCFPmode || mode == CCFPUmode)
13707 code = ix86_fp_compare_code_to_integer (code);
13708 mode = CCmode;
13710 if (reverse)
13711 code = reverse_condition (code);
13713 switch (code)
13715 case EQ:
13716 switch (mode)
13718 case CCAmode:
13719 suffix = "a";
13720 break;
13722 case CCCmode:
13723 suffix = "c";
13724 break;
13726 case CCOmode:
13727 suffix = "o";
13728 break;
13730 case CCSmode:
13731 suffix = "s";
13732 break;
13734 default:
13735 suffix = "e";
13737 break;
13738 case NE:
13739 switch (mode)
13741 case CCAmode:
13742 suffix = "na";
13743 break;
13745 case CCCmode:
13746 suffix = "nc";
13747 break;
13749 case CCOmode:
13750 suffix = "no";
13751 break;
13753 case CCSmode:
13754 suffix = "ns";
13755 break;
13757 default:
13758 suffix = "ne";
13760 break;
13761 case GT:
13762 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13763 suffix = "g";
13764 break;
13765 case GTU:
13766 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13767 Those same assemblers have the same but opposite lossage on cmov. */
13768 if (mode == CCmode)
13769 suffix = fp ? "nbe" : "a";
13770 else if (mode == CCCmode)
13771 suffix = "b";
13772 else
13773 gcc_unreachable ();
13774 break;
13775 case LT:
13776 switch (mode)
13778 case CCNOmode:
13779 case CCGOCmode:
13780 suffix = "s";
13781 break;
13783 case CCmode:
13784 case CCGCmode:
13785 suffix = "l";
13786 break;
13788 default:
13789 gcc_unreachable ();
13791 break;
13792 case LTU:
13793 gcc_assert (mode == CCmode || mode == CCCmode);
13794 suffix = "b";
13795 break;
13796 case GE:
13797 switch (mode)
13799 case CCNOmode:
13800 case CCGOCmode:
13801 suffix = "ns";
13802 break;
13804 case CCmode:
13805 case CCGCmode:
13806 suffix = "ge";
13807 break;
13809 default:
13810 gcc_unreachable ();
13812 break;
13813 case GEU:
13814 /* ??? As above. */
13815 gcc_assert (mode == CCmode || mode == CCCmode);
13816 suffix = fp ? "nb" : "ae";
13817 break;
13818 case LE:
13819 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13820 suffix = "le";
13821 break;
13822 case LEU:
13823 /* ??? As above. */
13824 if (mode == CCmode)
13825 suffix = "be";
13826 else if (mode == CCCmode)
13827 suffix = fp ? "nb" : "ae";
13828 else
13829 gcc_unreachable ();
13830 break;
13831 case UNORDERED:
13832 suffix = fp ? "u" : "p";
13833 break;
13834 case ORDERED:
13835 suffix = fp ? "nu" : "np";
13836 break;
13837 default:
13838 gcc_unreachable ();
13840 fputs (suffix, file);
13843 /* Print the name of register X to FILE based on its machine mode and number.
13844 If CODE is 'w', pretend the mode is HImode.
13845 If CODE is 'b', pretend the mode is QImode.
13846 If CODE is 'k', pretend the mode is SImode.
13847 If CODE is 'q', pretend the mode is DImode.
13848 If CODE is 'x', pretend the mode is V4SFmode.
13849 If CODE is 't', pretend the mode is V8SFmode.
13850 If CODE is 'h', pretend the reg is the 'high' byte register.
13851 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13852 If CODE is 'd', duplicate the operand for an AVX instruction.
13855 void
13856 print_reg (rtx x, int code, FILE *file)
13858 const char *reg;
13859 unsigned int regno;
13860 bool duplicated = code == 'd' && TARGET_AVX;
13862 if (ASSEMBLER_DIALECT == ASM_ATT)
13863 putc ('%', file);
13865 if (x == pc_rtx)
13867 gcc_assert (TARGET_64BIT);
13868 fputs ("rip", file);
13869 return;
13872 regno = true_regnum (x);
13873 gcc_assert (regno != ARG_POINTER_REGNUM
13874 && regno != FRAME_POINTER_REGNUM
13875 && regno != FLAGS_REG
13876 && regno != FPSR_REG
13877 && regno != FPCR_REG);
13879 if (code == 'w' || MMX_REG_P (x))
13880 code = 2;
13881 else if (code == 'b')
13882 code = 1;
13883 else if (code == 'k')
13884 code = 4;
13885 else if (code == 'q')
13886 code = 8;
13887 else if (code == 'y')
13888 code = 3;
13889 else if (code == 'h')
13890 code = 0;
13891 else if (code == 'x')
13892 code = 16;
13893 else if (code == 't')
13894 code = 32;
13895 else
13896 code = GET_MODE_SIZE (GET_MODE (x));
13898 /* Irritatingly, the AMD extended registers use a different naming
13899 convention from the normal registers: "r%d[bwd]". */
13900 if (REX_INT_REGNO_P (regno))
13902 gcc_assert (TARGET_64BIT);
13903 putc ('r', file);
13904 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13905 switch (code)
13907 case 0:
13908 error ("extended registers have no high halves");
13909 break;
13910 case 1:
13911 putc ('b', file);
13912 break;
13913 case 2:
13914 putc ('w', file);
13915 break;
13916 case 4:
13917 putc ('d', file);
13918 break;
13919 case 8:
13920 /* no suffix */
13921 break;
13922 default:
13923 error ("unsupported operand size for extended register");
13924 break;
13926 return;
13929 reg = NULL;
13930 switch (code)
13932 case 3:
13933 if (STACK_TOP_P (x))
13935 reg = "st(0)";
13936 break;
13938 /* FALLTHRU */
13939 case 8:
13940 case 4:
13941 case 12:
13942 if (! ANY_FP_REG_P (x))
13943 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13944 /* FALLTHRU */
13945 case 16:
13946 case 2:
13947 normal:
13948 reg = hi_reg_name[regno];
13949 break;
13950 case 1:
13951 if (regno >= ARRAY_SIZE (qi_reg_name))
13952 goto normal;
13953 reg = qi_reg_name[regno];
13954 break;
13955 case 0:
13956 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13957 goto normal;
13958 reg = qi_high_reg_name[regno];
13959 break;
13960 case 32:
13961 if (SSE_REG_P (x))
13963 gcc_assert (!duplicated);
13964 putc ('y', file);
13965 fputs (hi_reg_name[regno] + 1, file);
13966 return;
13968 break;
13969 default:
13970 gcc_unreachable ();
13973 fputs (reg, file);
13974 if (duplicated)
13976 if (ASSEMBLER_DIALECT == ASM_ATT)
13977 fprintf (file, ", %%%s", reg);
13978 else
13979 fprintf (file, ", %s", reg);
13983 /* Locate some local-dynamic symbol still in use by this function
13984 so that we can print its name in some tls_local_dynamic_base
13985 pattern. */
13987 static int
13988 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13990 rtx x = *px;
13992 if (GET_CODE (x) == SYMBOL_REF
13993 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13995 cfun->machine->some_ld_name = XSTR (x, 0);
13996 return 1;
13999 return 0;
14002 static const char *
14003 get_some_local_dynamic_name (void)
14005 rtx insn;
14007 if (cfun->machine->some_ld_name)
14008 return cfun->machine->some_ld_name;
14010 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14011 if (NONDEBUG_INSN_P (insn)
14012 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14013 return cfun->machine->some_ld_name;
14015 return NULL;
14018 /* Meaning of CODE:
14019 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14020 C -- print opcode suffix for set/cmov insn.
14021 c -- like C, but print reversed condition
14022 F,f -- likewise, but for floating-point.
14023 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14024 otherwise nothing
14025 R -- print the prefix for register names.
14026 z -- print the opcode suffix for the size of the current operand.
14027 Z -- likewise, with special suffixes for x87 instructions.
14028 * -- print a star (in certain assembler syntax)
14029 A -- print an absolute memory reference.
14030 E -- print address with DImode register names if TARGET_64BIT.
14031 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14032 s -- print a shift double count, followed by the assembler's argument
14033 delimiter.
14034 b -- print the QImode name of the register for the indicated operand.
14035 %b0 would print %al if operands[0] is reg 0.
14036 w -- likewise, print the HImode name of the register.
14037 k -- likewise, print the SImode name of the register.
14038 q -- likewise, print the DImode name of the register.
14039 x -- likewise, print the V4SFmode name of the register.
14040 t -- likewise, print the V8SFmode name of the register.
14041 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14042 y -- print "st(0)" instead of "st" as a register.
14043 d -- print duplicated register operand for AVX instruction.
14044 D -- print condition for SSE cmp instruction.
14045 P -- if PIC, print an @PLT suffix.
14046 p -- print raw symbol name.
14047 X -- don't print any sort of PIC '@' suffix for a symbol.
14048 & -- print some in-use local-dynamic symbol name.
14049 H -- print a memory address offset by 8; used for sse high-parts
14050 Y -- print condition for XOP pcom* instruction.
14051 + -- print a branch hint as 'cs' or 'ds' prefix
14052 ; -- print a semicolon (after prefixes, due to a bug in older gas).
14053 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14054 @ -- print a segment register of thread base pointer load
14055 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
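   For example, for a DImode register operand "%z0" emits the "q" size
   suffix while "%k0" prints its SImode name (%eax rather than %rax);
   a sketch of the size and register handling implemented below.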
14058 void
14059 ix86_print_operand (FILE *file, rtx x, int code)
14061 if (code)
14063 switch (code)
14065 case 'A':
14066 switch (ASSEMBLER_DIALECT)
14068 case ASM_ATT:
14069 putc ('*', file);
14070 break;
14072 case ASM_INTEL:
14073 /* Intel syntax. For absolute addresses, registers should not
14074 be surrounded by brackets. */
14075 if (!REG_P (x))
14077 putc ('[', file);
14078 ix86_print_operand (file, x, 0);
14079 putc (']', file);
14080 return;
14082 break;
14084 default:
14085 gcc_unreachable ();
14088 ix86_print_operand (file, x, 0);
14089 return;
14091 case 'E':
14092 /* Wrap address in an UNSPEC to declare special handling. */
14093 if (TARGET_64BIT)
14094 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14096 output_address (x);
14097 return;
14099 case 'L':
14100 if (ASSEMBLER_DIALECT == ASM_ATT)
14101 putc ('l', file);
14102 return;
14104 case 'W':
14105 if (ASSEMBLER_DIALECT == ASM_ATT)
14106 putc ('w', file);
14107 return;
14109 case 'B':
14110 if (ASSEMBLER_DIALECT == ASM_ATT)
14111 putc ('b', file);
14112 return;
14114 case 'Q':
14115 if (ASSEMBLER_DIALECT == ASM_ATT)
14116 putc ('l', file);
14117 return;
14119 case 'S':
14120 if (ASSEMBLER_DIALECT == ASM_ATT)
14121 putc ('s', file);
14122 return;
14124 case 'T':
14125 if (ASSEMBLER_DIALECT == ASM_ATT)
14126 putc ('t', file);
14127 return;
14129 case 'O':
14130 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14131 if (ASSEMBLER_DIALECT != ASM_ATT)
14132 return;
14134 switch (GET_MODE_SIZE (GET_MODE (x)))
14136 case 2:
14137 putc ('w', file);
14138 break;
14140 case 4:
14141 putc ('l', file);
14142 break;
14144 case 8:
14145 putc ('q', file);
14146 break;
14148 default:
14149 output_operand_lossage
14150 ("invalid operand size for operand code 'O'");
14151 return;
14154 putc ('.', file);
14155 #endif
14156 return;
14158 case 'z':
14159 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14161 /* Opcodes don't get size suffixes in Intel syntax. */
14162 if (ASSEMBLER_DIALECT == ASM_INTEL)
14163 return;
14165 switch (GET_MODE_SIZE (GET_MODE (x)))
14167 case 1:
14168 putc ('b', file);
14169 return;
14171 case 2:
14172 putc ('w', file);
14173 return;
14175 case 4:
14176 putc ('l', file);
14177 return;
14179 case 8:
14180 putc ('q', file);
14181 return;
14183 default:
14184 output_operand_lossage
14185 ("invalid operand size for operand code 'z'");
14186 return;
14190 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14191 warning
14192 (0, "non-integer operand used with operand code 'z'");
14193 /* FALLTHRU */
14195 case 'Z':
14196 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14197 if (ASSEMBLER_DIALECT == ASM_INTEL)
14198 return;
14200 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14202 switch (GET_MODE_SIZE (GET_MODE (x)))
14204 case 2:
14205 #ifdef HAVE_AS_IX86_FILDS
14206 putc ('s', file);
14207 #endif
14208 return;
14210 case 4:
14211 putc ('l', file);
14212 return;
14214 case 8:
14215 #ifdef HAVE_AS_IX86_FILDQ
14216 putc ('q', file);
14217 #else
14218 fputs ("ll", file);
14219 #endif
14220 return;
14222 default:
14223 break;
14226 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14228 /* 387 opcodes don't get size suffixes
14229 if the operands are registers. */
14230 if (STACK_REG_P (x))
14231 return;
14233 switch (GET_MODE_SIZE (GET_MODE (x)))
14235 case 4:
14236 putc ('s', file);
14237 return;
14239 case 8:
14240 putc ('l', file);
14241 return;
14243 case 12:
14244 case 16:
14245 putc ('t', file);
14246 return;
14248 default:
14249 break;
14252 else
14254 output_operand_lossage
14255 ("invalid operand type used with operand code 'Z'");
14256 return;
14259 output_operand_lossage
14260 ("invalid operand size for operand code 'Z'");
14261 return;
14263 case 'd':
14264 case 'b':
14265 case 'w':
14266 case 'k':
14267 case 'q':
14268 case 'h':
14269 case 't':
14270 case 'y':
14271 case 'x':
14272 case 'X':
14273 case 'P':
14274 case 'p':
14275 break;
14277 case 's':
14278 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14280 ix86_print_operand (file, x, 0);
14281 fputs (", ", file);
14283 return;
14285 case 'Y':
14286 switch (GET_CODE (x))
14288 case NE:
14289 fputs ("neq", file);
14290 break;
14291 case EQ:
14292 fputs ("eq", file);
14293 break;
14294 case GE:
14295 case GEU:
14296 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14297 break;
14298 case GT:
14299 case GTU:
14300 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14301 break;
14302 case LE:
14303 case LEU:
14304 fputs ("le", file);
14305 break;
14306 case LT:
14307 case LTU:
14308 fputs ("lt", file);
14309 break;
14310 case UNORDERED:
14311 fputs ("unord", file);
14312 break;
14313 case ORDERED:
14314 fputs ("ord", file);
14315 break;
14316 case UNEQ:
14317 fputs ("ueq", file);
14318 break;
14319 case UNGE:
14320 fputs ("nlt", file);
14321 break;
14322 case UNGT:
14323 fputs ("nle", file);
14324 break;
14325 case UNLE:
14326 fputs ("ule", file);
14327 break;
14328 case UNLT:
14329 fputs ("ult", file);
14330 break;
14331 case LTGT:
14332 fputs ("une", file);
14333 break;
14334 default:
14335 output_operand_lossage ("operand is not a condition code, "
14336 "invalid operand code 'Y'");
14337 return;
14339 return;
14341 case 'D':
14342 /* A little bit of braindamage here. The SSE compare instructions
14343 use completely different names for the comparisons than the
14344 fp conditional moves do. */
14345 switch (GET_CODE (x))
14347 case UNEQ:
14348 if (TARGET_AVX)
14350 fputs ("eq_us", file);
14351 break;
14353 case EQ:
14354 fputs ("eq", file);
14355 break;
14356 case UNLT:
14357 if (TARGET_AVX)
14359 fputs ("nge", file);
14360 break;
14362 case LT:
14363 fputs ("lt", file);
14364 break;
14365 case UNLE:
14366 if (TARGET_AVX)
14368 fputs ("ngt", file);
14369 break;
14371 case LE:
14372 fputs ("le", file);
14373 break;
14374 case UNORDERED:
14375 fputs ("unord", file);
14376 break;
14377 case LTGT:
14378 if (TARGET_AVX)
14380 fputs ("neq_oq", file);
14381 break;
14383 case NE:
14384 fputs ("neq", file);
14385 break;
14386 case GE:
14387 if (TARGET_AVX)
14389 fputs ("ge", file);
14390 break;
14392 case UNGE:
14393 fputs ("nlt", file);
14394 break;
14395 case GT:
14396 if (TARGET_AVX)
14398 fputs ("gt", file);
14399 break;
14401 case UNGT:
14402 fputs ("nle", file);
14403 break;
14404 case ORDERED:
14405 fputs ("ord", file);
14406 break;
14407 default:
14408 output_operand_lossage ("operand is not a condition code, "
14409 "invalid operand code 'D'");
14410 return;
14412 return;
14414 case 'F':
14415 case 'f':
14416 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14417 if (ASSEMBLER_DIALECT == ASM_ATT)
14418 putc ('.', file);
14419 #endif
14421 case 'C':
14422 case 'c':
14423 if (!COMPARISON_P (x))
14425 output_operand_lossage ("operand is not a condition code, "
14426 "invalid operand code '%c'", code);
14427 return;
14429 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14430 code == 'c' || code == 'f',
14431 code == 'F' || code == 'f',
14432 file);
14433 return;
14435 case 'H':
14436 if (!offsettable_memref_p (x))
14438 output_operand_lossage ("operand is not an offsettable memory "
14439 "reference, invalid operand code 'H'");
14440 return;
14442 /* It doesn't actually matter what mode we use here, as we're
14443 only going to use this for printing. */
14444 x = adjust_address_nv (x, DImode, 8);
14445 break;
14447 case 'K':
14448 gcc_assert (CONST_INT_P (x));
14450 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14451 #ifdef HAVE_AS_IX86_HLE
14452 fputs ("xacquire ", file);
14453 #else
14454 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14455 #endif
14456 else if (INTVAL (x) & IX86_HLE_RELEASE)
14457 #ifdef HAVE_AS_IX86_HLE
14458 fputs ("xrelease ", file);
14459 #else
14460 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14461 #endif
14462 /* We do not want to print the value of the operand. */
14463 return;
14465 case '*':
14466 if (ASSEMBLER_DIALECT == ASM_ATT)
14467 putc ('*', file);
14468 return;
14470 case '&':
14472 const char *name = get_some_local_dynamic_name ();
14473 if (name == NULL)
14474 output_operand_lossage ("'%%&' used without any "
14475 "local dynamic TLS references");
14476 else
14477 assemble_name (file, name);
14478 return;
14481 case '+':
14483 rtx x;
14485 if (!optimize
14486 || optimize_function_for_size_p (cfun)
14487 || !TARGET_BRANCH_PREDICTION_HINTS)
14488 return;
14490 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14491 if (x)
14493 int pred_val = INTVAL (XEXP (x, 0));
14495 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14496 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14498 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14499 bool cputaken
14500 = final_forward_branch_p (current_output_insn) == 0;
14502 /* Emit hints only in the case where the default branch prediction
14503 heuristics would fail. */
14504 if (taken != cputaken)
14506 /* We use 3e (DS) prefix for taken branches and
14507 2e (CS) prefix for not taken branches. */
14508 if (taken)
14509 fputs ("ds ; ", file);
14510 else
14511 fputs ("cs ; ", file);
14515 return;
14518 case ';':
14519 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14520 putc (';', file);
14521 #endif
14522 return;
14524 case '@':
14525 if (ASSEMBLER_DIALECT == ASM_ATT)
14526 putc ('%', file);
14528 /* The kernel uses a different segment register for performance
14529 reasons; a system call would not have to trash the userspace
14530 segment register, which would be expensive. */
14531 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14532 fputs ("fs", file);
14533 else
14534 fputs ("gs", file);
14535 return;
14537 case '~':
14538 putc (TARGET_AVX2 ? 'i' : 'f', file);
14539 return;
14541 case '^':
14542 if (TARGET_64BIT && Pmode != word_mode)
14543 fputs ("addr32 ", file);
14544 return;
14546 default:
14547 output_operand_lossage ("invalid operand code '%c'", code);
14551 if (REG_P (x))
14552 print_reg (x, code, file);
14554 else if (MEM_P (x))
14556 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14557 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14558 && GET_MODE (x) != BLKmode)
14560 const char * size;
14561 switch (GET_MODE_SIZE (GET_MODE (x)))
14563 case 1: size = "BYTE"; break;
14564 case 2: size = "WORD"; break;
14565 case 4: size = "DWORD"; break;
14566 case 8: size = "QWORD"; break;
14567 case 12: size = "TBYTE"; break;
14568 case 16:
14569 if (GET_MODE (x) == XFmode)
14570 size = "TBYTE";
14571 else
14572 size = "XMMWORD";
14573 break;
14574 case 32: size = "YMMWORD"; break;
14575 default:
14576 gcc_unreachable ();
14579 /* Check for explicit size override (codes 'b', 'w', 'k',
14580 'q' and 'x') */
14581 if (code == 'b')
14582 size = "BYTE";
14583 else if (code == 'w')
14584 size = "WORD";
14585 else if (code == 'k')
14586 size = "DWORD";
14587 else if (code == 'q')
14588 size = "QWORD";
14589 else if (code == 'x')
14590 size = "XMMWORD";
14592 fputs (size, file);
14593 fputs (" PTR ", file);
14596 x = XEXP (x, 0);
14597 /* Avoid (%rip) for call operands. */
14598 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14599 && !CONST_INT_P (x))
14600 output_addr_const (file, x);
14601 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14602 output_operand_lossage ("invalid constraints for operand");
14603 else
14604 output_address (x);
14607 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14609 REAL_VALUE_TYPE r;
14610 long l;
14612 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14613 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14615 if (ASSEMBLER_DIALECT == ASM_ATT)
14616 putc ('$', file);
14617 /* Sign-extend the 32-bit SFmode immediate to 8 bytes. */
14618 if (code == 'q')
14619 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14620 else
14621 fprintf (file, "0x%08x", (unsigned int) l);
14624 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14626 REAL_VALUE_TYPE r;
14627 long l[2];
14629 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14630 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14632 if (ASSEMBLER_DIALECT == ASM_ATT)
14633 putc ('$', file);
14634 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14637 /* These float cases don't actually occur as immediate operands. */
14638 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14640 char dstr[30];
14642 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14643 fputs (dstr, file);
14646 else
14648 /* We have patterns that allow zero sets of memory, for instance.
14649 In 64-bit mode, we should probably support all 8-byte vectors,
14650 since we can in fact encode that into an immediate. */
14651 if (GET_CODE (x) == CONST_VECTOR)
14653 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14654 x = const0_rtx;
14657 if (code != 'P' && code != 'p')
14659 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14661 if (ASSEMBLER_DIALECT == ASM_ATT)
14662 putc ('$', file);
14664 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14665 || GET_CODE (x) == LABEL_REF)
14667 if (ASSEMBLER_DIALECT == ASM_ATT)
14668 putc ('$', file);
14669 else
14670 fputs ("OFFSET FLAT:", file);
14673 if (CONST_INT_P (x))
14674 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14675 else if (flag_pic || MACHOPIC_INDIRECT)
14676 output_pic_addr_const (file, x, code);
14677 else
14678 output_addr_const (file, x);
14682 static bool
14683 ix86_print_operand_punct_valid_p (unsigned char code)
14685 return (code == '@' || code == '*' || code == '+' || code == '&'
14686 || code == ';' || code == '~' || code == '^');
14689 /* Print a memory operand whose address is ADDR. */
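/* E.g. base %ebx, index %ecx, scale 4 and displacement 16 comes out as
   "16(%ebx,%ecx,4)" in AT&T syntax and as "[ebx+16+ecx*4]" in Intel
   syntax (a sketch; segment overrides, RIP-relative and VSIB forms are
   handled specially below).  */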
14691 static void
14692 ix86_print_operand_address (FILE *file, rtx addr)
14694 struct ix86_address parts;
14695 rtx base, index, disp;
14696 int scale;
14697 int ok;
14698 bool vsib = false;
14699 int code = 0;
14701 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14703 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14704 gcc_assert (parts.index == NULL_RTX);
14705 parts.index = XVECEXP (addr, 0, 1);
14706 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14707 addr = XVECEXP (addr, 0, 0);
14708 vsib = true;
14710 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14712 gcc_assert (TARGET_64BIT);
14713 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14714 code = 'q';
14716 else
14717 ok = ix86_decompose_address (addr, &parts);
14719 gcc_assert (ok);
14721 base = parts.base;
14722 index = parts.index;
14723 disp = parts.disp;
14724 scale = parts.scale;
14726 switch (parts.seg)
14728 case SEG_DEFAULT:
14729 break;
14730 case SEG_FS:
14731 case SEG_GS:
14732 if (ASSEMBLER_DIALECT == ASM_ATT)
14733 putc ('%', file);
14734 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14735 break;
14736 default:
14737 gcc_unreachable ();
14740 /* Use the one-byte-shorter RIP-relative addressing for 64-bit mode. */
14741 if (TARGET_64BIT && !base && !index)
14743 rtx symbol = disp;
14745 if (GET_CODE (disp) == CONST
14746 && GET_CODE (XEXP (disp, 0)) == PLUS
14747 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14748 symbol = XEXP (XEXP (disp, 0), 0);
14750 if (GET_CODE (symbol) == LABEL_REF
14751 || (GET_CODE (symbol) == SYMBOL_REF
14752 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14753 base = pc_rtx;
14755 if (!base && !index)
14757 /* A displacement-only address requires special attention. */
14759 if (CONST_INT_P (disp))
14761 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14762 fputs ("ds:", file);
14763 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14765 else if (flag_pic)
14766 output_pic_addr_const (file, disp, 0);
14767 else
14768 output_addr_const (file, disp);
14770 else
14772 /* Print SImode register names to force the addr32 prefix. */
14773 if (SImode_address_operand (addr, VOIDmode))
14775 #ifdef ENABLE_CHECKING
14776 gcc_assert (TARGET_64BIT);
14777 switch (GET_CODE (addr))
14779 case SUBREG:
14780 gcc_assert (GET_MODE (addr) == SImode);
14781 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14782 break;
14783 case ZERO_EXTEND:
14784 case AND:
14785 gcc_assert (GET_MODE (addr) == DImode);
14786 break;
14787 default:
14788 gcc_unreachable ();
14790 #endif
14791 gcc_assert (!code);
14792 code = 'k';
14794 else if (code == 0
14795 && TARGET_X32
14796 && disp
14797 && CONST_INT_P (disp)
14798 && INTVAL (disp) < -16*1024*1024)
14800 /* X32 runs in 64-bit mode, where displacement, DISP, in
14801 address DISP(%r64), is encoded as 32-bit immediate sign-
14802 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14803 address is %r64 + 0xffffffffbffffd00. When %r64 <
14804 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14805 which is invalid for x32. The correct address is %r64
14806 - 0x40000300 == 0xf7ffdd64. To properly encode
14807 -0x40000300(%r64) for x32, we zero-extend negative
14808 displacement by forcing addr32 prefix which truncates
14809 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14810 zero-extend all negative displacements, including -1(%rsp).
14811 However, for small negative displacements, sign-extension
14812 won't cause overflow. We only zero-extend negative
14813 displacements if they are < -16*1024*1024, which is also the bound
14814 used to check legitimate address displacements for PIC. */
14815 code = 'k';
14818 if (ASSEMBLER_DIALECT == ASM_ATT)
14820 if (disp)
14822 if (flag_pic)
14823 output_pic_addr_const (file, disp, 0);
14824 else if (GET_CODE (disp) == LABEL_REF)
14825 output_asm_label (disp);
14826 else
14827 output_addr_const (file, disp);
14830 putc ('(', file);
14831 if (base)
14832 print_reg (base, code, file);
14833 if (index)
14835 putc (',', file);
14836 print_reg (index, vsib ? 0 : code, file);
14837 if (scale != 1 || vsib)
14838 fprintf (file, ",%d", scale);
14840 putc (')', file);
14842 else
14844 rtx offset = NULL_RTX;
14846 if (disp)
14848 /* Pull out the offset of a symbol; print any symbol itself. */
14849 if (GET_CODE (disp) == CONST
14850 && GET_CODE (XEXP (disp, 0)) == PLUS
14851 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14853 offset = XEXP (XEXP (disp, 0), 1);
14854 disp = gen_rtx_CONST (VOIDmode,
14855 XEXP (XEXP (disp, 0), 0));
14858 if (flag_pic)
14859 output_pic_addr_const (file, disp, 0);
14860 else if (GET_CODE (disp) == LABEL_REF)
14861 output_asm_label (disp);
14862 else if (CONST_INT_P (disp))
14863 offset = disp;
14864 else
14865 output_addr_const (file, disp);
14868 putc ('[', file);
14869 if (base)
14871 print_reg (base, code, file);
14872 if (offset)
14874 if (INTVAL (offset) >= 0)
14875 putc ('+', file);
14876 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14879 else if (offset)
14880 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14881 else
14882 putc ('0', file);
14884 if (index)
14886 putc ('+', file);
14887 print_reg (index, vsib ? 0 : code, file);
14888 if (scale != 1 || vsib)
14889 fprintf (file, "*%d", scale);
14891 putc (']', file);
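/* Illustrative note (not in the original sources): for a base+index
   address with base %ebx, index %esi, scale 4 and displacement 16,
   the AT&T branch above prints "16(%ebx,%esi,4)" while the Intel
   branch prints "[ebx+16+esi*4]"; for VSIB addresses the index is
   printed with its vector register name instead.  */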
14896 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14898 static bool
14899 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14901 rtx op;
14903 if (GET_CODE (x) != UNSPEC)
14904 return false;
14906 op = XVECEXP (x, 0, 0);
14907 switch (XINT (x, 1))
14909 case UNSPEC_GOTTPOFF:
14910 output_addr_const (file, op);
14911 /* FIXME: This might be @TPOFF in Sun ld. */
14912 fputs ("@gottpoff", file);
14913 break;
14914 case UNSPEC_TPOFF:
14915 output_addr_const (file, op);
14916 fputs ("@tpoff", file);
14917 break;
14918 case UNSPEC_NTPOFF:
14919 output_addr_const (file, op);
14920 if (TARGET_64BIT)
14921 fputs ("@tpoff", file);
14922 else
14923 fputs ("@ntpoff", file);
14924 break;
14925 case UNSPEC_DTPOFF:
14926 output_addr_const (file, op);
14927 fputs ("@dtpoff", file);
14928 break;
14929 case UNSPEC_GOTNTPOFF:
14930 output_addr_const (file, op);
14931 if (TARGET_64BIT)
14932 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14933 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14934 else
14935 fputs ("@gotntpoff", file);
14936 break;
14937 case UNSPEC_INDNTPOFF:
14938 output_addr_const (file, op);
14939 fputs ("@indntpoff", file);
14940 break;
14941 #if TARGET_MACHO
14942 case UNSPEC_MACHOPIC_OFFSET:
14943 output_addr_const (file, op);
14944 putc ('-', file);
14945 machopic_output_function_base_name (file);
14946 break;
14947 #endif
14949 case UNSPEC_STACK_CHECK:
14951 int offset;
14953 gcc_assert (flag_split_stack);
14955 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14956 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14957 #else
14958 gcc_unreachable ();
14959 #endif
14961 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14963 break;
14965 default:
14966 return false;
14969 return true;
14972 /* Split one or more double-mode RTL references into pairs of half-mode
14973 references. The RTL can be REG, offsettable MEM, integer constant, or
14974 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14975 split and "num" is its length. lo_half and hi_half are output arrays
14976 that parallel "operands". */
14978 void
14979 split_double_mode (enum machine_mode mode, rtx operands[],
14980 int num, rtx lo_half[], rtx hi_half[])
14982 enum machine_mode half_mode;
14983 unsigned int byte;
14985 switch (mode)
14987 case TImode:
14988 half_mode = DImode;
14989 break;
14990 case DImode:
14991 half_mode = SImode;
14992 break;
14993 default:
14994 gcc_unreachable ();
14997 byte = GET_MODE_SIZE (half_mode);
14999 while (num--)
15001 rtx op = operands[num];
15003 /* simplify_subreg refuses to split volatile memory addresses,
15004 but we still have to handle them. */
15005 if (MEM_P (op))
15007 lo_half[num] = adjust_address (op, half_mode, 0);
15008 hi_half[num] = adjust_address (op, half_mode, byte);
15010 else
15012 lo_half[num] = simplify_gen_subreg (half_mode, op,
15013 GET_MODE (op) == VOIDmode
15014 ? mode : GET_MODE (op), 0);
15015 hi_half[num] = simplify_gen_subreg (half_mode, op,
15016 GET_MODE (op) == VOIDmode
15017 ? mode : GET_MODE (op), byte);
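/* Usage sketch (an assumption about a typical caller, not lifted from
   this file): splitting two DImode operands for a post-reload splitter
   might look like

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);

   after which lo[i]/hi[i] hold the SImode low and high words of
   operands[i].  */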
15022 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15023 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15024 is the expression of the binary operation. The output may either be
15025 emitted here, or returned to the caller, like all output_* functions.
15027 There is no guarantee that the operands are the same mode, as they
15028 might be within FLOAT or FLOAT_EXTEND expressions. */
15030 #ifndef SYSV386_COMPAT
15031 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15032 wants to fix the assemblers because that causes incompatibility
15033 with gcc. No-one wants to fix gcc because that causes
15034 incompatibility with assemblers... You can use the option of
15035 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15036 #define SYSV386_COMPAT 1
15037 #endif
15039 const char *
15040 output_387_binary_op (rtx insn, rtx *operands)
15042 static char buf[40];
15043 const char *p;
15044 const char *ssep;
15045 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15047 #ifdef ENABLE_CHECKING
15048 /* Even if we do not want to check the inputs, this documents the input
15049 constraints, which helps in understanding the following code. */
15050 if (STACK_REG_P (operands[0])
15051 && ((REG_P (operands[1])
15052 && REGNO (operands[0]) == REGNO (operands[1])
15053 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15054 || (REG_P (operands[2])
15055 && REGNO (operands[0]) == REGNO (operands[2])
15056 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15057 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15058 ; /* ok */
15059 else
15060 gcc_assert (is_sse);
15061 #endif
15063 switch (GET_CODE (operands[3]))
15065 case PLUS:
15066 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15067 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15068 p = "fiadd";
15069 else
15070 p = "fadd";
15071 ssep = "vadd";
15072 break;
15074 case MINUS:
15075 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15076 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15077 p = "fisub";
15078 else
15079 p = "fsub";
15080 ssep = "vsub";
15081 break;
15083 case MULT:
15084 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15085 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15086 p = "fimul";
15087 else
15088 p = "fmul";
15089 ssep = "vmul";
15090 break;
15092 case DIV:
15093 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15094 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15095 p = "fidiv";
15096 else
15097 p = "fdiv";
15098 ssep = "vdiv";
15099 break;
15101 default:
15102 gcc_unreachable ();
15105 if (is_sse)
15107 if (TARGET_AVX)
15109 strcpy (buf, ssep);
15110 if (GET_MODE (operands[0]) == SFmode)
15111 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15112 else
15113 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15115 else
15117 strcpy (buf, ssep + 1);
15118 if (GET_MODE (operands[0]) == SFmode)
15119 strcat (buf, "ss\t{%2, %0|%0, %2}");
15120 else
15121 strcat (buf, "sd\t{%2, %0|%0, %2}");
15123 return buf;
15125 strcpy (buf, p);
15127 switch (GET_CODE (operands[3]))
15129 case MULT:
15130 case PLUS:
15131 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15133 rtx temp = operands[2];
15134 operands[2] = operands[1];
15135 operands[1] = temp;
15138 /* We now know operands[0] == operands[1]. */
15140 if (MEM_P (operands[2]))
15142 p = "%Z2\t%2";
15143 break;
15146 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15148 if (STACK_TOP_P (operands[0]))
15149 /* How is it that we are storing to a dead operand[2]?
15150 Well, presumably operands[1] is dead too. We can't
15151 store the result to st(0) as st(0) gets popped on this
15152 instruction. Instead store to operands[2] (which I
15153 think has to be st(1)). st(1) will be popped later.
15154 gcc <= 2.8.1 didn't have this check and generated
15155 assembly code that the Unixware assembler rejected. */
15156 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15157 else
15158 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15159 break;
15162 if (STACK_TOP_P (operands[0]))
15163 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15164 else
15165 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15166 break;
15168 case MINUS:
15169 case DIV:
15170 if (MEM_P (operands[1]))
15172 p = "r%Z1\t%1";
15173 break;
15176 if (MEM_P (operands[2]))
15178 p = "%Z2\t%2";
15179 break;
15182 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15184 #if SYSV386_COMPAT
15185 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15186 derived assemblers, confusingly reverse the direction of
15187 the operation for fsub{r} and fdiv{r} when the
15188 destination register is not st(0). The Intel assembler
15189 doesn't have this brain damage. Read !SYSV386_COMPAT to
15190 figure out what the hardware really does. */
15191 if (STACK_TOP_P (operands[0]))
15192 p = "{p\t%0, %2|rp\t%2, %0}";
15193 else
15194 p = "{rp\t%2, %0|p\t%0, %2}";
15195 #else
15196 if (STACK_TOP_P (operands[0]))
15197 /* As above for fmul/fadd, we can't store to st(0). */
15198 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15199 else
15200 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15201 #endif
15202 break;
15205 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15207 #if SYSV386_COMPAT
15208 if (STACK_TOP_P (operands[0]))
15209 p = "{rp\t%0, %1|p\t%1, %0}";
15210 else
15211 p = "{p\t%1, %0|rp\t%0, %1}";
15212 #else
15213 if (STACK_TOP_P (operands[0]))
15214 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15215 else
15216 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15217 #endif
15218 break;
15221 if (STACK_TOP_P (operands[0]))
15223 if (STACK_TOP_P (operands[1]))
15224 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15225 else
15226 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15227 break;
15229 else if (STACK_TOP_P (operands[1]))
15231 #if SYSV386_COMPAT
15232 p = "{\t%1, %0|r\t%0, %1}";
15233 #else
15234 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15235 #endif
15237 else
15239 #if SYSV386_COMPAT
15240 p = "{r\t%2, %0|\t%0, %2}";
15241 #else
15242 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15243 #endif
15245 break;
15247 default:
15248 gcc_unreachable ();
15251 strcat (buf, p);
15252 return buf;
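/* Example of the returned template (illustrative): for an SFmode PLUS
   whose operands are SSE registers under AVX, the code above builds
   "vaddss\t{%2, %1, %0|%0, %1, %2}" -- the usual "AT&T|Intel" dual
   template later consumed by output_asm_insn.  */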
15255 /* Check if a 256bit AVX register is referenced inside of EXP. */
15257 static int
15258 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15260 rtx exp = *pexp;
15262 if (GET_CODE (exp) == SUBREG)
15263 exp = SUBREG_REG (exp);
15265 if (REG_P (exp)
15266 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15267 return 1;
15269 return 0;
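/* Typical use (sketch): the callers below pass this as a for_each_rtx
   callback, e.g.

     if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
       ...

   which stops and returns nonzero as soon as any (subreg of a)
   register in a 256bit AVX or OImode mode is found in the pattern.  */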
15272 /* Return needed mode for entity in optimize_mode_switching pass. */
15274 static int
15275 ix86_avx_u128_mode_needed (rtx insn)
15277 if (CALL_P (insn))
15279 rtx link;
15281 /* Needed mode is set to AVX_U128_CLEAN if there are
15282 no 256bit modes used in function arguments. */
15283 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15284 link;
15285 link = XEXP (link, 1))
15287 if (GET_CODE (XEXP (link, 0)) == USE)
15289 rtx arg = XEXP (XEXP (link, 0), 0);
15291 if (ix86_check_avx256_register (&arg, NULL))
15292 return AVX_U128_ANY;
15296 return AVX_U128_CLEAN;
15299 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15300 changes state only when a 256bit register is written to, but we need
15301 to prevent the compiler from moving the optimal insertion point above
15302 an eventual read from a 256bit register. */
15303 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15304 return AVX_U128_DIRTY;
15306 return AVX_U128_ANY;
15309 /* Return mode that i387 must be switched into
15310 prior to the execution of insn. */
15312 static int
15313 ix86_i387_mode_needed (int entity, rtx insn)
15315 enum attr_i387_cw mode;
15317 /* The mode UNINITIALIZED is used to store the control word after a
15318 function call or ASM pattern. The mode ANY specifies that the function
15319 has no requirements on the control word and makes no changes to the
15320 bits we are interested in. */
15322 if (CALL_P (insn)
15323 || (NONJUMP_INSN_P (insn)
15324 && (asm_noperands (PATTERN (insn)) >= 0
15325 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15326 return I387_CW_UNINITIALIZED;
15328 if (recog_memoized (insn) < 0)
15329 return I387_CW_ANY;
15331 mode = get_attr_i387_cw (insn);
15333 switch (entity)
15335 case I387_TRUNC:
15336 if (mode == I387_CW_TRUNC)
15337 return mode;
15338 break;
15340 case I387_FLOOR:
15341 if (mode == I387_CW_FLOOR)
15342 return mode;
15343 break;
15345 case I387_CEIL:
15346 if (mode == I387_CW_CEIL)
15347 return mode;
15348 break;
15350 case I387_MASK_PM:
15351 if (mode == I387_CW_MASK_PM)
15352 return mode;
15353 break;
15355 default:
15356 gcc_unreachable ();
15359 return I387_CW_ANY;
15362 /* Return mode that entity must be switched into
15363 prior to the execution of insn. */
15366 ix86_mode_needed (int entity, rtx insn)
15368 switch (entity)
15370 case AVX_U128:
15371 return ix86_avx_u128_mode_needed (insn);
15372 case I387_TRUNC:
15373 case I387_FLOOR:
15374 case I387_CEIL:
15375 case I387_MASK_PM:
15376 return ix86_i387_mode_needed (entity, insn);
15377 default:
15378 gcc_unreachable ();
15380 return 0;
15383 /* Check if a 256bit AVX register is referenced in stores. */
15385 static void
15386 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15388 if (ix86_check_avx256_register (&dest, NULL))
15390 bool *used = (bool *) data;
15391 *used = true;
15395 /* Calculate mode of upper 128bit AVX registers after the insn. */
15397 static int
15398 ix86_avx_u128_mode_after (int mode, rtx insn)
15400 rtx pat = PATTERN (insn);
15402 if (vzeroupper_operation (pat, VOIDmode)
15403 || vzeroall_operation (pat, VOIDmode))
15404 return AVX_U128_CLEAN;
15406 /* We know that the state is clean after a CALL insn if no 256bit
15407 register is used for the function return value. */
15408 if (CALL_P (insn))
15410 bool avx_reg256_found = false;
15411 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15412 if (!avx_reg256_found)
15413 return AVX_U128_CLEAN;
15416 /* Otherwise, return current mode. Remember that if insn
15417 references AVX 256bit registers, the mode was already changed
15418 to DIRTY from MODE_NEEDED. */
15419 return mode;
15422 /* Return the mode that an insn results in. */
15425 ix86_mode_after (int entity, int mode, rtx insn)
15427 switch (entity)
15429 case AVX_U128:
15430 return ix86_avx_u128_mode_after (mode, insn);
15431 case I387_TRUNC:
15432 case I387_FLOOR:
15433 case I387_CEIL:
15434 case I387_MASK_PM:
15435 return mode;
15436 default:
15437 gcc_unreachable ();
15441 static int
15442 ix86_avx_u128_mode_entry (void)
15444 tree arg;
15446 /* Entry mode is set to AVX_U128_DIRTY if there are
15447 256bit modes used in function arguments. */
15448 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15449 arg = TREE_CHAIN (arg))
15451 rtx incoming = DECL_INCOMING_RTL (arg);
15453 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15454 return AVX_U128_DIRTY;
15457 return AVX_U128_CLEAN;
15460 /* Return a mode that ENTITY is assumed to be
15461 switched to at function entry. */
15464 ix86_mode_entry (int entity)
15466 switch (entity)
15468 case AVX_U128:
15469 return ix86_avx_u128_mode_entry ();
15470 case I387_TRUNC:
15471 case I387_FLOOR:
15472 case I387_CEIL:
15473 case I387_MASK_PM:
15474 return I387_CW_ANY;
15475 default:
15476 gcc_unreachable ();
15480 static int
15481 ix86_avx_u128_mode_exit (void)
15483 rtx reg = crtl->return_rtx;
15485 /* Exit mode is set to AVX_U128_DIRTY if there are
15486 256bit modes used in the function return register. */
15487 if (reg && ix86_check_avx256_register (&reg, NULL))
15488 return AVX_U128_DIRTY;
15490 return AVX_U128_CLEAN;
15493 /* Return a mode that ENTITY is assumed to be
15494 switched to at function exit. */
15497 ix86_mode_exit (int entity)
15499 switch (entity)
15501 case AVX_U128:
15502 return ix86_avx_u128_mode_exit ();
15503 case I387_TRUNC:
15504 case I387_FLOOR:
15505 case I387_CEIL:
15506 case I387_MASK_PM:
15507 return I387_CW_ANY;
15508 default:
15509 gcc_unreachable ();
15513 /* Output code to initialize control word copies used by trunc?f?i and
15514 rounding patterns. CURRENT_MODE is set to current control word,
15515 while NEW_MODE is set to new control word. */
15517 static void
15518 emit_i387_cw_initialization (int mode)
15520 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15521 rtx new_mode;
15523 enum ix86_stack_slot slot;
15525 rtx reg = gen_reg_rtx (HImode);
15527 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15528 emit_move_insn (reg, copy_rtx (stored_mode));
15530 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15531 || optimize_function_for_size_p (cfun))
15533 switch (mode)
15535 case I387_CW_TRUNC:
15536 /* round toward zero (truncate) */
15537 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15538 slot = SLOT_CW_TRUNC;
15539 break;
15541 case I387_CW_FLOOR:
15542 /* round down toward -oo */
15543 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15544 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15545 slot = SLOT_CW_FLOOR;
15546 break;
15548 case I387_CW_CEIL:
15549 /* round up toward +oo */
15550 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15551 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15552 slot = SLOT_CW_CEIL;
15553 break;
15555 case I387_CW_MASK_PM:
15556 /* mask precision exception for nearbyint() */
15557 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15558 slot = SLOT_CW_MASK_PM;
15559 break;
15561 default:
15562 gcc_unreachable ();
15565 else
15567 switch (mode)
15569 case I387_CW_TRUNC:
15570 /* round toward zero (truncate) */
15571 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15572 slot = SLOT_CW_TRUNC;
15573 break;
15575 case I387_CW_FLOOR:
15576 /* round down toward -oo */
15577 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15578 slot = SLOT_CW_FLOOR;
15579 break;
15581 case I387_CW_CEIL:
15582 /* round up toward +oo */
15583 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15584 slot = SLOT_CW_CEIL;
15585 break;
15587 case I387_CW_MASK_PM:
15588 /* mask precision exception for nearbyint() */
15589 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15590 slot = SLOT_CW_MASK_PM;
15591 break;
15593 default:
15594 gcc_unreachable ();
15598 gcc_assert (slot < MAX_386_STACK_LOCALS);
15600 new_mode = assign_386_stack_local (HImode, slot);
15601 emit_move_insn (new_mode, reg);
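/* Background for the magic constants above (summarized from the x87
   architecture, not from this file): bits 10-11 of the control word
   form the rounding-control field -- 0x0000 nearest, 0x0400 down,
   0x0800 up, 0x0c00 truncate -- and bit 5 (0x0020) masks the
   precision exception, which nearbyint() relies on.  */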
15604 /* Emit vzeroupper. */
15606 void
15607 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15609 int i;
15611 /* Cancel automatic vzeroupper insertion if there are
15612 live call-saved SSE registers at the insertion point. */
15614 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15615 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15616 return;
15618 if (TARGET_64BIT)
15619 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15620 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15621 return;
15623 emit_insn (gen_avx_vzeroupper ());
15626 /* Generate one or more insns to set ENTITY to MODE. */
15628 void
15629 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15631 switch (entity)
15633 case AVX_U128:
15634 if (mode == AVX_U128_CLEAN)
15635 ix86_avx_emit_vzeroupper (regs_live);
15636 break;
15637 case I387_TRUNC:
15638 case I387_FLOOR:
15639 case I387_CEIL:
15640 case I387_MASK_PM:
15641 if (mode != I387_CW_ANY
15642 && mode != I387_CW_UNINITIALIZED)
15643 emit_i387_cw_initialization (mode);
15644 break;
15645 default:
15646 gcc_unreachable ();
15650 /* Output code for INSN to convert a float to a signed int. OPERANDS
15651 are the insn operands. The output may be [HSD]Imode and the input
15652 operand may be [SDX]Fmode. */
15654 const char *
15655 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15657 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15658 int dimode_p = GET_MODE (operands[0]) == DImode;
15659 int round_mode = get_attr_i387_cw (insn);
15661 /* Jump through a hoop or two for DImode, since the hardware has no
15662 non-popping instruction. We used to do this a different way, but
15663 that was somewhat fragile and broke with post-reload splitters. */
15664 if ((dimode_p || fisttp) && !stack_top_dies)
15665 output_asm_insn ("fld\t%y1", operands);
15667 gcc_assert (STACK_TOP_P (operands[1]));
15668 gcc_assert (MEM_P (operands[0]));
15669 gcc_assert (GET_MODE (operands[1]) != TFmode);
15671 if (fisttp)
15672 output_asm_insn ("fisttp%Z0\t%0", operands);
15673 else
15675 if (round_mode != I387_CW_ANY)
15676 output_asm_insn ("fldcw\t%3", operands);
15677 if (stack_top_dies || dimode_p)
15678 output_asm_insn ("fistp%Z0\t%0", operands);
15679 else
15680 output_asm_insn ("fist%Z0\t%0", operands);
15681 if (round_mode != I387_CW_ANY)
15682 output_asm_insn ("fldcw\t%2", operands);
15685 return "";
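/* Shape of the emitted sequence (illustrative): when a non-default
   rounding mode is needed the output is roughly

     fldcw   %3          ; switch to the truncating control word
     fistp   %0          ; store the integer result (and pop)
     fldcw   %2          ; restore the caller's control word

   with a single fisttp%Z0 used instead when the SSE3 truncating
   store is available.  */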
15688 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15689 have the values zero or one, indicates the ffreep insn's operand
15690 from the OPERANDS array. */
15692 static const char *
15693 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15695 if (TARGET_USE_FFREEP)
15696 #ifdef HAVE_AS_IX86_FFREEP
15697 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15698 #else
15700 static char retval[32];
15701 int regno = REGNO (operands[opno]);
15703 gcc_assert (STACK_REGNO_P (regno));
15705 regno -= FIRST_STACK_REG;
15707 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15708 return retval;
15710 #endif
15712 return opno ? "fstp\t%y1" : "fstp\t%y0";
15716 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15717 should be used. UNORDERED_P is true when fucom should be used. */
15719 const char *
15720 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15722 int stack_top_dies;
15723 rtx cmp_op0, cmp_op1;
15724 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15726 if (eflags_p)
15728 cmp_op0 = operands[0];
15729 cmp_op1 = operands[1];
15731 else
15733 cmp_op0 = operands[1];
15734 cmp_op1 = operands[2];
15737 if (is_sse)
15739 if (GET_MODE (operands[0]) == SFmode)
15740 if (unordered_p)
15741 return "%vucomiss\t{%1, %0|%0, %1}";
15742 else
15743 return "%vcomiss\t{%1, %0|%0, %1}";
15744 else
15745 if (unordered_p)
15746 return "%vucomisd\t{%1, %0|%0, %1}";
15747 else
15748 return "%vcomisd\t{%1, %0|%0, %1}";
15751 gcc_assert (STACK_TOP_P (cmp_op0));
15753 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15755 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15757 if (stack_top_dies)
15759 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15760 return output_387_ffreep (operands, 1);
15762 else
15763 return "ftst\n\tfnstsw\t%0";
15766 if (STACK_REG_P (cmp_op1)
15767 && stack_top_dies
15768 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15769 && REGNO (cmp_op1) != FIRST_STACK_REG)
15771 /* If both the top of the 387 stack dies, and the other operand
15772 is also a stack register that dies, then this must be a
15773 `fcompp' float compare */
15775 if (eflags_p)
15777 /* There is no double popping fcomi variant. Fortunately,
15778 eflags is immune from the fstp's cc clobbering. */
15779 if (unordered_p)
15780 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15781 else
15782 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15783 return output_387_ffreep (operands, 0);
15785 else
15787 if (unordered_p)
15788 return "fucompp\n\tfnstsw\t%0";
15789 else
15790 return "fcompp\n\tfnstsw\t%0";
15793 else
15795 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15797 static const char * const alt[16] =
15799 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15800 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15801 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15802 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15804 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15805 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15806 NULL,
15807 NULL,
15809 "fcomi\t{%y1, %0|%0, %y1}",
15810 "fcomip\t{%y1, %0|%0, %y1}",
15811 "fucomi\t{%y1, %0|%0, %y1}",
15812 "fucomip\t{%y1, %0|%0, %y1}",
15814 NULL,
15815 NULL,
15816 NULL,
15817 NULL
15820 int mask;
15821 const char *ret;
15823 mask = eflags_p << 3;
15824 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15825 mask |= unordered_p << 1;
15826 mask |= stack_top_dies;
15828 gcc_assert (mask < 16);
15829 ret = alt[mask];
15830 gcc_assert (ret);
15832 return ret;
15836 void
15837 ix86_output_addr_vec_elt (FILE *file, int value)
15839 const char *directive = ASM_LONG;
15841 #ifdef ASM_QUAD
15842 if (TARGET_LP64)
15843 directive = ASM_QUAD;
15844 #else
15845 gcc_assert (!TARGET_64BIT);
15846 #endif
15848 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15851 void
15852 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15854 const char *directive = ASM_LONG;
15856 #ifdef ASM_QUAD
15857 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15858 directive = ASM_QUAD;
15859 #else
15860 gcc_assert (!TARGET_64BIT);
15861 #endif
15862 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15863 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15864 fprintf (file, "%s%s%d-%s%d\n",
15865 directive, LPREFIX, value, LPREFIX, rel);
15866 else if (HAVE_AS_GOTOFF_IN_DATA)
15867 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15868 #if TARGET_MACHO
15869 else if (TARGET_MACHO)
15871 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15872 machopic_output_function_base_name (file);
15873 putc ('\n', file);
15875 #endif
15876 else
15877 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15878 GOT_SYMBOL_NAME, LPREFIX, value);
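/* Example output (illustrative): a 64bit target with a DImode case
   vector emits entries such as ".quad .L5-.L2", while a 32bit PIC
   target whose assembler supports GOTOFF in data emits
   ".long .L5@GOTOFF" instead.  */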
15881 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15882 for the target. */
15884 void
15885 ix86_expand_clear (rtx dest)
15887 rtx tmp;
15889 /* We play register width games, which are only valid after reload. */
15890 gcc_assert (reload_completed);
15892 /* Avoid HImode and its attendant prefix byte. */
15893 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15894 dest = gen_rtx_REG (SImode, REGNO (dest));
15895 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15897 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15898 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15900 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15901 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15904 emit_insn (tmp);
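/* Illustration (assumption about the resulting assembly): with the
   flags clobber attached, the movsi_xor pattern renders a clear of
   %eax as "xorl %eax, %eax"; without it (TARGET_USE_MOV0 and not
   optimizing the insn for speed) a plain "movl $0, %eax" is kept.  */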
15907 /* X is an unchanging MEM. If it is a constant pool reference, return
15908 the constant pool rtx, else NULL. */
15911 maybe_get_pool_constant (rtx x)
15913 x = ix86_delegitimize_address (XEXP (x, 0));
15915 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15916 return get_pool_constant (x);
15918 return NULL_RTX;
15921 void
15922 ix86_expand_move (enum machine_mode mode, rtx operands[])
15924 rtx op0, op1;
15925 enum tls_model model;
15927 op0 = operands[0];
15928 op1 = operands[1];
15930 if (GET_CODE (op1) == SYMBOL_REF)
15932 model = SYMBOL_REF_TLS_MODEL (op1);
15933 if (model)
15935 op1 = legitimize_tls_address (op1, model, true);
15936 op1 = force_operand (op1, op0);
15937 if (op1 == op0)
15938 return;
15939 op1 = convert_to_mode (mode, op1, 1);
15941 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15942 && SYMBOL_REF_DLLIMPORT_P (op1))
15943 op1 = legitimize_dllimport_symbol (op1, false);
15945 else if (GET_CODE (op1) == CONST
15946 && GET_CODE (XEXP (op1, 0)) == PLUS
15947 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15949 rtx addend = XEXP (XEXP (op1, 0), 1);
15950 rtx symbol = XEXP (XEXP (op1, 0), 0);
15951 rtx tmp = NULL;
15953 model = SYMBOL_REF_TLS_MODEL (symbol);
15954 if (model)
15955 tmp = legitimize_tls_address (symbol, model, true);
15956 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15957 && SYMBOL_REF_DLLIMPORT_P (symbol))
15958 tmp = legitimize_dllimport_symbol (symbol, true);
15960 if (tmp)
15962 tmp = force_operand (tmp, NULL);
15963 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15964 op0, 1, OPTAB_DIRECT);
15965 if (tmp == op0)
15966 return;
15967 op1 = convert_to_mode (mode, tmp, 1);
15971 if ((flag_pic || MACHOPIC_INDIRECT)
15972 && symbolic_operand (op1, mode))
15974 if (TARGET_MACHO && !TARGET_64BIT)
15976 #if TARGET_MACHO
15977 /* dynamic-no-pic */
15978 if (MACHOPIC_INDIRECT)
15980 rtx temp = ((reload_in_progress
15981 || ((op0 && REG_P (op0))
15982 && mode == Pmode))
15983 ? op0 : gen_reg_rtx (Pmode));
15984 op1 = machopic_indirect_data_reference (op1, temp);
15985 if (MACHOPIC_PURE)
15986 op1 = machopic_legitimize_pic_address (op1, mode,
15987 temp == op1 ? 0 : temp);
15989 if (op0 != op1 && GET_CODE (op0) != MEM)
15991 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15992 emit_insn (insn);
15993 return;
15995 if (GET_CODE (op0) == MEM)
15996 op1 = force_reg (Pmode, op1);
15997 else
15999 rtx temp = op0;
16000 if (GET_CODE (temp) != REG)
16001 temp = gen_reg_rtx (Pmode);
16002 temp = legitimize_pic_address (op1, temp);
16003 if (temp == op0)
16004 return;
16005 op1 = temp;
16007 /* dynamic-no-pic */
16008 #endif
16010 else
16012 if (MEM_P (op0))
16013 op1 = force_reg (mode, op1);
16014 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16016 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16017 op1 = legitimize_pic_address (op1, reg);
16018 if (op0 == op1)
16019 return;
16020 op1 = convert_to_mode (mode, op1, 1);
16024 else
16026 if (MEM_P (op0)
16027 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16028 || !push_operand (op0, mode))
16029 && MEM_P (op1))
16030 op1 = force_reg (mode, op1);
16032 if (push_operand (op0, mode)
16033 && ! general_no_elim_operand (op1, mode))
16034 op1 = copy_to_mode_reg (mode, op1);
16036 /* Force large constants in 64bit compilation into a register
16037 to get them CSEed. */
16038 if (can_create_pseudo_p ()
16039 && (mode == DImode) && TARGET_64BIT
16040 && immediate_operand (op1, mode)
16041 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16042 && !register_operand (op0, mode)
16043 && optimize)
16044 op1 = copy_to_mode_reg (mode, op1);
16046 if (can_create_pseudo_p ()
16047 && FLOAT_MODE_P (mode)
16048 && GET_CODE (op1) == CONST_DOUBLE)
16050 /* If we are loading a floating point constant to a register,
16051 force the value to memory now, since we'll get better code
16052 out the back end. */
16054 op1 = validize_mem (force_const_mem (mode, op1));
16055 if (!register_operand (op0, mode))
16057 rtx temp = gen_reg_rtx (mode);
16058 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16059 emit_move_insn (op0, temp);
16060 return;
16065 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16068 void
16069 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16071 rtx op0 = operands[0], op1 = operands[1];
16072 unsigned int align = GET_MODE_ALIGNMENT (mode);
16074 /* Force constants other than zero into memory. We do not know how
16075 the instructions used to build constants modify the upper 64 bits
16076 of the register; once we have that information we may be able
16077 to handle some of them more efficiently. */
16078 if (can_create_pseudo_p ()
16079 && register_operand (op0, mode)
16080 && (CONSTANT_P (op1)
16081 || (GET_CODE (op1) == SUBREG
16082 && CONSTANT_P (SUBREG_REG (op1))))
16083 && !standard_sse_constant_p (op1))
16084 op1 = validize_mem (force_const_mem (mode, op1));
16086 /* We need to check memory alignment for SSE modes since an attribute
16087 can make operands unaligned. */
16088 if (can_create_pseudo_p ()
16089 && SSE_REG_MODE_P (mode)
16090 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16091 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16093 rtx tmp[2];
16095 /* ix86_expand_vector_move_misalign() does not like constants ... */
16096 if (CONSTANT_P (op1)
16097 || (GET_CODE (op1) == SUBREG
16098 && CONSTANT_P (SUBREG_REG (op1))))
16099 op1 = validize_mem (force_const_mem (mode, op1));
16101 /* ... nor both arguments in memory. */
16102 if (!register_operand (op0, mode)
16103 && !register_operand (op1, mode))
16104 op1 = force_reg (mode, op1);
16106 tmp[0] = op0; tmp[1] = op1;
16107 ix86_expand_vector_move_misalign (mode, tmp);
16108 return;
16111 /* Make operand1 a register if it isn't already. */
16112 if (can_create_pseudo_p ()
16113 && !register_operand (op0, mode)
16114 && !register_operand (op1, mode))
16116 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16117 return;
16120 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16123 /* Split 32-byte AVX unaligned load and store if needed. */
16125 static void
16126 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16128 rtx m;
16129 rtx (*extract) (rtx, rtx, rtx);
16130 rtx (*load_unaligned) (rtx, rtx);
16131 rtx (*store_unaligned) (rtx, rtx);
16132 enum machine_mode mode;
16134 switch (GET_MODE (op0))
16136 default:
16137 gcc_unreachable ();
16138 case V32QImode:
16139 extract = gen_avx_vextractf128v32qi;
16140 load_unaligned = gen_avx_loaddqu256;
16141 store_unaligned = gen_avx_storedqu256;
16142 mode = V16QImode;
16143 break;
16144 case V8SFmode:
16145 extract = gen_avx_vextractf128v8sf;
16146 load_unaligned = gen_avx_loadups256;
16147 store_unaligned = gen_avx_storeups256;
16148 mode = V4SFmode;
16149 break;
16150 case V4DFmode:
16151 extract = gen_avx_vextractf128v4df;
16152 load_unaligned = gen_avx_loadupd256;
16153 store_unaligned = gen_avx_storeupd256;
16154 mode = V2DFmode;
16155 break;
16158 if (MEM_P (op1))
16160 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16162 rtx r = gen_reg_rtx (mode);
16163 m = adjust_address (op1, mode, 0);
16164 emit_move_insn (r, m);
16165 m = adjust_address (op1, mode, 16);
16166 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16167 emit_move_insn (op0, r);
16169 else
16170 emit_insn (load_unaligned (op0, op1));
16172 else if (MEM_P (op0))
16174 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16176 m = adjust_address (op0, mode, 0);
16177 emit_insn (extract (m, op1, const0_rtx));
16178 m = adjust_address (op0, mode, 16);
16179 emit_insn (extract (m, op1, const1_rtx));
16181 else
16182 emit_insn (store_unaligned (op0, op1));
16184 else
16185 gcc_unreachable ();
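/* Rough picture of the output (illustrative, depends on the tuning
   flags): a split unaligned 256bit load becomes a 128bit load of the
   low half plus a vinsertf128 of the high half, a split store becomes
   two vextractf128 stores of 16 bytes each, and otherwise a single
   unaligned vmovups/vmovdqu of the whole 32 bytes is emitted.  */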
16188 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16189 straight to ix86_expand_vector_move. */
16190 /* Code generation for scalar reg-reg moves of single and double precision data:
16191 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16192 movaps reg, reg
16193 else
16194 movss reg, reg
16195 if (x86_sse_partial_reg_dependency == true)
16196 movapd reg, reg
16197 else
16198 movsd reg, reg
16200 Code generation for scalar loads of double precision data:
16201 if (x86_sse_split_regs == true)
16202 movlpd mem, reg (gas syntax)
16203 else
16204 movsd mem, reg
16206 Code generation for unaligned packed loads of single precision data
16207 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16208 if (x86_sse_unaligned_move_optimal)
16209 movups mem, reg
16211 if (x86_sse_partial_reg_dependency == true)
16213 xorps reg, reg
16214 movlps mem, reg
16215 movhps mem+8, reg
16217 else
16219 movlps mem, reg
16220 movhps mem+8, reg
16223 Code generation for unaligned packed loads of double precision data
16224 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16225 if (x86_sse_unaligned_move_optimal)
16226 movupd mem, reg
16228 if (x86_sse_split_regs == true)
16230 movlpd mem, reg
16231 movhpd mem+8, reg
16233 else
16235 movsd mem, reg
16236 movhpd mem+8, reg
16240 void
16241 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16243 rtx op0, op1, m;
16245 op0 = operands[0];
16246 op1 = operands[1];
16248 if (TARGET_AVX
16249 && GET_MODE_SIZE (mode) == 32)
16251 switch (GET_MODE_CLASS (mode))
16253 case MODE_VECTOR_INT:
16254 case MODE_INT:
16255 op0 = gen_lowpart (V32QImode, op0);
16256 op1 = gen_lowpart (V32QImode, op1);
16257 /* FALLTHRU */
16259 case MODE_VECTOR_FLOAT:
16260 ix86_avx256_split_vector_move_misalign (op0, op1);
16261 break;
16263 default:
16264 gcc_unreachable ();
16267 return;
16270 if (MEM_P (op1))
16272 /* ??? If we have typed data, then it would appear that using
16273 movdqu is the only way to get unaligned data loaded with
16274 integer type. */
16275 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16277 op0 = gen_lowpart (V16QImode, op0);
16278 op1 = gen_lowpart (V16QImode, op1);
16279 /* We will eventually emit movups based on insn attributes. */
16280 emit_insn (gen_sse2_loaddqu (op0, op1));
16282 else if (TARGET_SSE2 && mode == V2DFmode)
16284 rtx zero;
16286 if (TARGET_AVX
16287 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16288 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16289 || optimize_function_for_size_p (cfun))
16291 /* We will eventually emit movups based on insn attributes. */
16292 emit_insn (gen_sse2_loadupd (op0, op1));
16293 return;
16296 /* When SSE registers are split into halves, we can avoid
16297 writing to the top half twice. */
16298 if (TARGET_SSE_SPLIT_REGS)
16300 emit_clobber (op0);
16301 zero = op0;
16303 else
16305 /* ??? Not sure about the best option for the Intel chips.
16306 The following would seem to satisfy; the register is
16307 entirely cleared, breaking the dependency chain. We
16308 then store to the upper half, with a dependency depth
16309 of one. A rumor has it that Intel recommends two movsd
16310 followed by an unpacklpd, but this is unconfirmed. And
16311 given that the dependency depth of the unpacklpd would
16312 still be one, I'm not sure why this would be better. */
16313 zero = CONST0_RTX (V2DFmode);
16316 m = adjust_address (op1, DFmode, 0);
16317 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16318 m = adjust_address (op1, DFmode, 8);
16319 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16321 else
16323 if (TARGET_AVX
16324 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16325 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16326 || optimize_function_for_size_p (cfun))
16328 op0 = gen_lowpart (V4SFmode, op0);
16329 op1 = gen_lowpart (V4SFmode, op1);
16330 emit_insn (gen_sse_loadups (op0, op1));
16331 return;
16334 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16335 emit_move_insn (op0, CONST0_RTX (mode));
16336 else
16337 emit_clobber (op0);
16339 if (mode != V4SFmode)
16340 op0 = gen_lowpart (V4SFmode, op0);
16342 m = adjust_address (op1, V2SFmode, 0);
16343 emit_insn (gen_sse_loadlps (op0, op0, m));
16344 m = adjust_address (op1, V2SFmode, 8);
16345 emit_insn (gen_sse_loadhps (op0, op0, m));
16348 else if (MEM_P (op0))
16350 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16352 op0 = gen_lowpart (V16QImode, op0);
16353 op1 = gen_lowpart (V16QImode, op1);
16354 /* We will eventually emit movups based on insn attributes. */
16355 emit_insn (gen_sse2_storedqu (op0, op1));
16357 else if (TARGET_SSE2 && mode == V2DFmode)
16359 if (TARGET_AVX
16360 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16361 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16362 || optimize_function_for_size_p (cfun))
16363 /* We will eventually emit movups based on insn attributes. */
16364 emit_insn (gen_sse2_storeupd (op0, op1));
16365 else
16367 m = adjust_address (op0, DFmode, 0);
16368 emit_insn (gen_sse2_storelpd (m, op1));
16369 m = adjust_address (op0, DFmode, 8);
16370 emit_insn (gen_sse2_storehpd (m, op1));
16373 else
16375 if (mode != V4SFmode)
16376 op1 = gen_lowpart (V4SFmode, op1);
16378 if (TARGET_AVX
16379 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16380 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16381 || optimize_function_for_size_p (cfun))
16383 op0 = gen_lowpart (V4SFmode, op0);
16384 emit_insn (gen_sse_storeups (op0, op1));
16386 else
16388 m = adjust_address (op0, V2SFmode, 0);
16389 emit_insn (gen_sse_storelps (m, op1));
16390 m = adjust_address (op0, V2SFmode, 8);
16391 emit_insn (gen_sse_storehps (m, op1));
16395 else
16396 gcc_unreachable ();
16399 /* Expand a push in MODE. This is some mode for which we do not support
16400 proper push instructions, at least from the registers that we expect
16401 the value to live in. */
16403 void
16404 ix86_expand_push (enum machine_mode mode, rtx x)
16406 rtx tmp;
16408 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16409 GEN_INT (-GET_MODE_SIZE (mode)),
16410 stack_pointer_rtx, 1, OPTAB_DIRECT);
16411 if (tmp != stack_pointer_rtx)
16412 emit_move_insn (stack_pointer_rtx, tmp);
16414 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16416 /* When we push an operand onto the stack, it has to be aligned at least
16417 at the function argument boundary. However, since we don't have
16418 the argument type, we can't determine the actual argument
16419 boundary. */
16420 emit_move_insn (tmp, x);
16423 /* Helper function of ix86_fixup_binary_operands to canonicalize
16424 operand order. Returns true if the operands should be swapped. */
16426 static bool
16427 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16428 rtx operands[])
16430 rtx dst = operands[0];
16431 rtx src1 = operands[1];
16432 rtx src2 = operands[2];
16434 /* If the operation is not commutative, we can't do anything. */
16435 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16436 return false;
16438 /* Highest priority is that src1 should match dst. */
16439 if (rtx_equal_p (dst, src1))
16440 return false;
16441 if (rtx_equal_p (dst, src2))
16442 return true;
16444 /* Next highest priority is that immediate constants come second. */
16445 if (immediate_operand (src2, mode))
16446 return false;
16447 if (immediate_operand (src1, mode))
16448 return true;
16450 /* Lowest priority is that memory references should come second. */
16451 if (MEM_P (src2))
16452 return false;
16453 if (MEM_P (src1))
16454 return true;
16456 return false;
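/* Worked example (illustrative): expanding "dst = 4 + b" arrives here
   with src1 an immediate and src2 a register; the second rule above
   requests a swap so the add is emitted as dst = b + 4, matching the
   "%0 = %1 op %2" shape of the binary patterns.  */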
16460 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16461 destination to use for the operation. If different from the true
16462 destination in operands[0], a copy operation will be required. */
16465 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16466 rtx operands[])
16468 rtx dst = operands[0];
16469 rtx src1 = operands[1];
16470 rtx src2 = operands[2];
16472 /* Canonicalize operand order. */
16473 if (ix86_swap_binary_operands_p (code, mode, operands))
16475 rtx temp;
16477 /* It is invalid to swap operands of different modes. */
16478 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16480 temp = src1;
16481 src1 = src2;
16482 src2 = temp;
16485 /* Both source operands cannot be in memory. */
16486 if (MEM_P (src1) && MEM_P (src2))
16488 /* Optimization: Only read from memory once. */
16489 if (rtx_equal_p (src1, src2))
16491 src2 = force_reg (mode, src2);
16492 src1 = src2;
16494 else
16495 src2 = force_reg (mode, src2);
16498 /* If the destination is memory, and we do not have matching source
16499 operands, do things in registers. */
16500 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16501 dst = gen_reg_rtx (mode);
16503 /* Source 1 cannot be a constant. */
16504 if (CONSTANT_P (src1))
16505 src1 = force_reg (mode, src1);
16507 /* Source 1 cannot be a non-matching memory. */
16508 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16509 src1 = force_reg (mode, src1);
16511 /* Improve address combine. */
16512 if (code == PLUS
16513 && GET_MODE_CLASS (mode) == MODE_INT
16514 && MEM_P (src2))
16515 src2 = force_reg (mode, src2);
16517 operands[1] = src1;
16518 operands[2] = src2;
16519 return dst;
16522 /* Similarly, but assume that the destination has already been
16523 set up properly. */
16525 void
16526 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16527 enum machine_mode mode, rtx operands[])
16529 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16530 gcc_assert (dst == operands[0]);
16533 /* Attempt to expand a binary operator. Make the expansion closer to the
16534 actual machine than just general_operand, which would allow 3 separate
16535 memory references (one output, two input) in a single insn. */
16537 void
16538 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16539 rtx operands[])
16541 rtx src1, src2, dst, op, clob;
16543 dst = ix86_fixup_binary_operands (code, mode, operands);
16544 src1 = operands[1];
16545 src2 = operands[2];
16547 /* Emit the instruction. */
16549 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16550 if (reload_in_progress)
16552 /* Reload doesn't know about the flags register, and doesn't know that
16553 it doesn't want to clobber it. We can only do this with PLUS. */
16554 gcc_assert (code == PLUS);
16555 emit_insn (op);
16557 else if (reload_completed
16558 && code == PLUS
16559 && !rtx_equal_p (dst, src1))
16561 /* This is going to be an LEA; avoid splitting it later. */
16562 emit_insn (op);
16564 else
16566 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16567 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16570 /* Fix up the destination if needed. */
16571 if (dst != operands[0])
16572 emit_move_insn (operands[0], dst);
16575 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16576 the given OPERANDS. */
16578 void
16579 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16580 rtx operands[])
16582 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16583 if (GET_CODE (operands[1]) == SUBREG)
16585 op1 = operands[1];
16586 op2 = operands[2];
16588 else if (GET_CODE (operands[2]) == SUBREG)
16590 op1 = operands[2];
16591 op2 = operands[1];
16593 /* Optimize (__m128i) d | (__m128i) e and similar code,
16594 where d and e are float vectors, into a float vector logical
16595 insn. In C/C++ without using intrinsics there is no other way
16596 to express a vector logical operation on float vectors than
16597 to cast them temporarily to integer vectors. */
16598 if (op1
16599 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16600 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16601 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16602 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16603 && SUBREG_BYTE (op1) == 0
16604 && (GET_CODE (op2) == CONST_VECTOR
16605 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16606 && SUBREG_BYTE (op2) == 0))
16607 && can_create_pseudo_p ())
16609 rtx dst;
16610 switch (GET_MODE (SUBREG_REG (op1)))
16612 case V4SFmode:
16613 case V8SFmode:
16614 case V2DFmode:
16615 case V4DFmode:
16616 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16617 if (GET_CODE (op2) == CONST_VECTOR)
16619 op2 = gen_lowpart (GET_MODE (dst), op2);
16620 op2 = force_reg (GET_MODE (dst), op2);
16622 else
16624 op1 = operands[1];
16625 op2 = SUBREG_REG (operands[2]);
16626 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16627 op2 = force_reg (GET_MODE (dst), op2);
16629 op1 = SUBREG_REG (op1);
16630 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16631 op1 = force_reg (GET_MODE (dst), op1);
16632 emit_insn (gen_rtx_SET (VOIDmode, dst,
16633 gen_rtx_fmt_ee (code, GET_MODE (dst),
16634 op1, op2)));
16635 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16636 return;
16637 default:
16638 break;
16641 if (!nonimmediate_operand (operands[1], mode))
16642 operands[1] = force_reg (mode, operands[1]);
16643 if (!nonimmediate_operand (operands[2], mode))
16644 operands[2] = force_reg (mode, operands[2]);
16645 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16646 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16647 gen_rtx_fmt_ee (code, mode, operands[1],
16648 operands[2])));
16651 /* Return TRUE or FALSE depending on whether the binary operator meets the
16652 appropriate constraints. */
16654 bool
16655 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16656 rtx operands[3])
16658 rtx dst = operands[0];
16659 rtx src1 = operands[1];
16660 rtx src2 = operands[2];
16662 /* Both source operands cannot be in memory. */
16663 if (MEM_P (src1) && MEM_P (src2))
16664 return false;
16666 /* Canonicalize operand order for commutative operators. */
16667 if (ix86_swap_binary_operands_p (code, mode, operands))
16669 rtx temp = src1;
16670 src1 = src2;
16671 src2 = temp;
16674 /* If the destination is memory, we must have a matching source operand. */
16675 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16676 return false;
16678 /* Source 1 cannot be a constant. */
16679 if (CONSTANT_P (src1))
16680 return false;
16682 /* Source 1 cannot be a non-matching memory. */
16683 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16684 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16685 return (code == AND
16686 && (mode == HImode
16687 || mode == SImode
16688 || (TARGET_64BIT && mode == DImode))
16689 && satisfies_constraint_L (src2));
16691 return true;
16694 /* Attempt to expand a unary operator. Make the expansion closer to the
16695 actual machine than just general_operand, which would allow 2 separate
16696 memory references (one output, one input) in a single insn. */
16698 void
16699 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16700 rtx operands[])
16702 int matching_memory;
16703 rtx src, dst, op, clob;
16705 dst = operands[0];
16706 src = operands[1];
16708 /* If the destination is memory, and we do not have matching source
16709 operands, do things in registers. */
16710 matching_memory = 0;
16711 if (MEM_P (dst))
16713 if (rtx_equal_p (dst, src))
16714 matching_memory = 1;
16715 else
16716 dst = gen_reg_rtx (mode);
16719 /* When source operand is memory, destination must match. */
16720 if (MEM_P (src) && !matching_memory)
16721 src = force_reg (mode, src);
16723 /* Emit the instruction. */
16725 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16726 if (reload_in_progress || code == NOT)
16728 /* Reload doesn't know about the flags register, and doesn't know that
16729 it doesn't want to clobber it. */
16730 gcc_assert (code == NOT);
16731 emit_insn (op);
16733 else
16735 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16736 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16739 /* Fix up the destination if needed. */
16740 if (dst != operands[0])
16741 emit_move_insn (operands[0], dst);
16744 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16745 divisor are within the range [0-255]. */
16747 void
16748 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16749 bool signed_p)
16751 rtx end_label, qimode_label;
16752 rtx insn, div, mod;
16753 rtx scratch, tmp0, tmp1, tmp2;
16754 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16755 rtx (*gen_zero_extend) (rtx, rtx);
16756 rtx (*gen_test_ccno_1) (rtx, rtx);
16758 switch (mode)
16760 case SImode:
16761 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16762 gen_test_ccno_1 = gen_testsi_ccno_1;
16763 gen_zero_extend = gen_zero_extendqisi2;
16764 break;
16765 case DImode:
16766 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16767 gen_test_ccno_1 = gen_testdi_ccno_1;
16768 gen_zero_extend = gen_zero_extendqidi2;
16769 break;
16770 default:
16771 gcc_unreachable ();
16774 end_label = gen_label_rtx ();
16775 qimode_label = gen_label_rtx ();
16777 scratch = gen_reg_rtx (mode);
16779 /* Use 8bit unsigned divmod if dividend and divisor are within
16780 the range [0-255]. */
16781 emit_move_insn (scratch, operands[2]);
16782 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16783 scratch, 1, OPTAB_DIRECT);
16784 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16785 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16786 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16787 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16788 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16789 pc_rtx);
16790 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16791 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16792 JUMP_LABEL (insn) = qimode_label;
16794 /* Generate original signed/unsigned divmod. */
16795 div = gen_divmod4_1 (operands[0], operands[1],
16796 operands[2], operands[3]);
16797 emit_insn (div);
16799 /* Branch to the end. */
16800 emit_jump_insn (gen_jump (end_label));
16801 emit_barrier ();
16803 /* Generate 8bit unsigned divide. */
16804 emit_label (qimode_label);
16805 /* Don't use operands[0] for the result of the 8bit divide since not all
16806 registers support QImode ZERO_EXTRACT. */
16807 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16808 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16809 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16810 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16812 if (signed_p)
16814 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16815 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16817 else
16819 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16820 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16823 /* Extract remainder from AH. */
16824 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16825 if (REG_P (operands[1]))
16826 insn = emit_move_insn (operands[1], tmp1);
16827 else
16829 /* Need a new scratch register since the old one has result
16830 of 8bit divide. */
16831 scratch = gen_reg_rtx (mode);
16832 emit_move_insn (scratch, tmp1);
16833 insn = emit_move_insn (operands[1], scratch);
16835 set_unique_reg_note (insn, REG_EQUAL, mod);
16837 /* Zero extend quotient from AL. */
16838 tmp1 = gen_lowpart (QImode, tmp0);
16839 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16840 set_unique_reg_note (insn, REG_EQUAL, div);
16842 emit_label (end_label);
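/* Sketch of the generated control flow (not a literal dump):

     mov     op2, scratch
     or      op3, scratch        ; do both operands fit in 8 bits?
     test    $-0x100, scratch
     je      .Lqimode
     idiv/div ...                ; full-width divide
     jmp     .Ldone
   .Lqimode:
     divb    ...                 ; 8bit divide: AL = quotient, AH = remainder
   .Ldone:                                                              */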
16845 #define LEA_MAX_STALL (3)
16846 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16848 /* Increase given DISTANCE in half-cycles according to
16849 dependencies between PREV and NEXT instructions.
16850 Add 1 half-cycle if there is no dependency and
16851 go to the next cycle if there is some dependency. */
16853 static unsigned int
16854 increase_distance (rtx prev, rtx next, unsigned int distance)
16856 df_ref *use_rec;
16857 df_ref *def_rec;
16859 if (!prev || !next)
16860 return distance + (distance & 1) + 2;
16862 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16863 return distance + 1;
16865 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16866 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16867 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16868 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16869 return distance + (distance & 1) + 2;
16871 return distance + 1;
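/* In other words (paraphrase of the arithmetic above): an unrelated
   instruction costs one half-cycle, while a true dependency between
   PREV and NEXT rounds DISTANCE up to a cycle boundary and then
   charges one more full cycle -- the "distance + (distance & 1) + 2"
   case.  */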
16874 /* Function checks if instruction INSN defines register number
16875 REGNO1 or REGNO2. */
16877 static bool
16878 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16879 rtx insn)
16881 df_ref *def_rec;
16883 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16884 if (DF_REF_REG_DEF_P (*def_rec)
16885 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16886 && (regno1 == DF_REF_REGNO (*def_rec)
16887 || regno2 == DF_REF_REGNO (*def_rec)))
16889 return true;
16892 return false;
16895 /* Function checks if instruction INSN uses register number
16896 REGNO as a part of address expression. */
16898 static bool
16899 insn_uses_reg_mem (unsigned int regno, rtx insn)
16901 df_ref *use_rec;
16903 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16904 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16905 return true;
16907 return false;
16910 /* Search backward for non-agu definition of register number REGNO1
16911 or register number REGNO2 in basic block starting from instruction
16912 START up to head of basic block or instruction INSN.
16914 Function puts true value into *FOUND var if definition was found
16915 and false otherwise.
16917 Distance in half-cycles between START and found instruction or head
16918 of BB is added to DISTANCE and returned. */
16920 static int
16921 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16922 rtx insn, int distance,
16923 rtx start, bool *found)
16925 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16926 rtx prev = start;
16927 rtx next = NULL;
16929 *found = false;
16931 while (prev
16932 && prev != insn
16933 && distance < LEA_SEARCH_THRESHOLD)
16935 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16937 distance = increase_distance (prev, next, distance);
16938 if (insn_defines_reg (regno1, regno2, prev))
16940 if (recog_memoized (prev) < 0
16941 || get_attr_type (prev) != TYPE_LEA)
16943 *found = true;
16944 return distance;
16948 next = prev;
16950 if (prev == BB_HEAD (bb))
16951 break;
16953 prev = PREV_INSN (prev);
16956 return distance;
16959 /* Search backward for non-agu definition of register number REGNO1
16960 or register number REGNO2 in INSN's basic block until
16961 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16962 2. Reach neighbour BBs boundary, or
16963 3. Reach agu definition.
16964 Returns the distance between the non-agu definition point and INSN.
16965 If no definition point, returns -1. */
16967 static int
16968 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16969 rtx insn)
16971 basic_block bb = BLOCK_FOR_INSN (insn);
16972 int distance = 0;
16973 bool found = false;
16975 if (insn != BB_HEAD (bb))
16976 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16977 distance, PREV_INSN (insn),
16978 &found);
16980 if (!found && distance < LEA_SEARCH_THRESHOLD)
16982 edge e;
16983 edge_iterator ei;
16984 bool simple_loop = false;
16986 FOR_EACH_EDGE (e, ei, bb->preds)
16987 if (e->src == bb)
16989 simple_loop = true;
16990 break;
16993 if (simple_loop)
16994 distance = distance_non_agu_define_in_bb (regno1, regno2,
16995 insn, distance,
16996 BB_END (bb), &found);
16997 else
16999 int shortest_dist = -1;
17000 bool found_in_bb = false;
17002 FOR_EACH_EDGE (e, ei, bb->preds)
17004 int bb_dist
17005 = distance_non_agu_define_in_bb (regno1, regno2,
17006 insn, distance,
17007 BB_END (e->src),
17008 &found_in_bb);
17009 if (found_in_bb)
17011 if (shortest_dist < 0)
17012 shortest_dist = bb_dist;
17013 else if (bb_dist > 0)
17014 shortest_dist = MIN (bb_dist, shortest_dist);
17016 found = true;
17020 distance = shortest_dist;
17024 /* get_attr_type may modify recog data. We want to make sure
17025 that recog data is valid for instruction INSN, on which
17026 distance_non_agu_define is called. INSN is unchanged here. */
17027 extract_insn_cached (insn);
17029 if (!found)
17030 return -1;
17032 return distance >> 1;
17035 /* Return the distance in half-cycles between INSN and the next
17036 insn that uses register number REGNO in a memory address, added
17037 to DISTANCE. Return -1 if REGNO is set.
17039 Put true into *FOUND if a register use was found and
17040 false otherwise.
17041 Put true into *REDEFINED if a register redefinition was
17042 found and false otherwise. */
17044 static int
17045 distance_agu_use_in_bb (unsigned int regno,
17046 rtx insn, int distance, rtx start,
17047 bool *found, bool *redefined)
17049 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17050 rtx next = start;
17051 rtx prev = NULL;
17053 *found = false;
17054 *redefined = false;
17056 while (next
17057 && next != insn
17058 && distance < LEA_SEARCH_THRESHOLD)
17060 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17062 distance = increase_distance (prev, next, distance);
17063 if (insn_uses_reg_mem (regno, next))
17065 /* Return DISTANCE if OP0 is used in memory
17066 address in NEXT. */
17067 *found = true;
17068 return distance;
17071 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17073 /* Return -1 if OP0 is set in NEXT. */
17074 *redefined = true;
17075 return -1;
17078 prev = next;
17081 if (next == BB_END (bb))
17082 break;
17084 next = NEXT_INSN (next);
17087 return distance;
17090 /* Return the distance between INSN and the next insn that uses
17091 register number REGNO0 in memory address. Return -1 if no such
17092 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17094 static int
17095 distance_agu_use (unsigned int regno0, rtx insn)
17097 basic_block bb = BLOCK_FOR_INSN (insn);
17098 int distance = 0;
17099 bool found = false;
17100 bool redefined = false;
17102 if (insn != BB_END (bb))
17103 distance = distance_agu_use_in_bb (regno0, insn, distance,
17104 NEXT_INSN (insn),
17105 &found, &redefined);
17107 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17109 edge e;
17110 edge_iterator ei;
17111 bool simple_loop = false;
17113 FOR_EACH_EDGE (e, ei, bb->succs)
17114 if (e->dest == bb)
17116 simple_loop = true;
17117 break;
17120 if (simple_loop)
17121 distance = distance_agu_use_in_bb (regno0, insn,
17122 distance, BB_HEAD (bb),
17123 &found, &redefined);
17124 else
17126 int shortest_dist = -1;
17127 bool found_in_bb = false;
17128 bool redefined_in_bb = false;
17130 FOR_EACH_EDGE (e, ei, bb->succs)
17132 int bb_dist
17133 = distance_agu_use_in_bb (regno0, insn,
17134 distance, BB_HEAD (e->dest),
17135 &found_in_bb, &redefined_in_bb);
17136 if (found_in_bb)
17138 if (shortest_dist < 0)
17139 shortest_dist = bb_dist;
17140 else if (bb_dist > 0)
17141 shortest_dist = MIN (bb_dist, shortest_dist);
17143 found = true;
17147 distance = shortest_dist;
17151 if (!found || redefined)
17152 return -1;
17154 return distance >> 1;
17157 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17158 there is a dilemma of choosing LEA or ADD.
17159 Negative value: ADD is preferred over LEA
17160 Zero: Neutral
17161 Positive value: LEA is preferred over ADD. */
17162 #define IX86_LEA_PRIORITY 0
17164 /* Return true if using lea INSN has a performance advantage
17165 over a sequence of instructions. The instruction sequence has
17166 SPLIT_COST cycles higher latency than the lea latency. */
17168 static bool
17169 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17170 unsigned int regno2, int split_cost)
17172 int dist_define, dist_use;
17174 dist_define = distance_non_agu_define (regno1, regno2, insn);
17175 dist_use = distance_agu_use (regno0, insn);
17177 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17179 /* If there is no non-AGU operand definition, no AGU
17180 operand usage and split cost is 0, then both the lea
17181 and non-lea variants have the same priority. Currently
17182 we prefer lea for 64-bit code and non-lea for 32-bit
17183 code. */
17184 if (dist_use < 0 && split_cost == 0)
17185 return TARGET_64BIT || IX86_LEA_PRIORITY;
17186 else
17187 return true;
17190 /* The longer the definition distance, the more preferable lea is.
17191 Here we adjust the distance to take into account the splitting
17192 cost and the lea priority. */
17193 dist_define += split_cost + IX86_LEA_PRIORITY;
17195 /* If there is no use in a memory address then we just check
17196 that the split cost exceeds the AGU stall. */
17197 if (dist_use < 0)
17198 return dist_define > LEA_MAX_STALL;
17200 /* If this insn has both backward non-agu dependence and forward
17201 agu dependence, the one with short distance takes effect. */
17202 return dist_define >= dist_use;
17205 /* Return true if it is legal to clobber flags by INSN and
17206 false otherwise. */
17208 static bool
17209 ix86_ok_to_clobber_flags (rtx insn)
17211 basic_block bb = BLOCK_FOR_INSN (insn);
17212 df_ref *use;
17213 bitmap live;
17215 while (insn)
17217 if (NONDEBUG_INSN_P (insn))
17219 for (use = DF_INSN_USES (insn); *use; use++)
17220 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17221 return false;
17223 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17224 return true;
17227 if (insn == BB_END (bb))
17228 break;
17230 insn = NEXT_INSN (insn);
17233 live = df_get_live_out (bb);
17234 return !REGNO_REG_SET_P (live, FLAGS_REG);
17237 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17238 move and add to avoid AGU stalls. */
17240 bool
17241 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17243 unsigned int regno0, regno1, regno2;
17245 /* Check if we need to optimize. */
17246 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17247 return false;
17249 /* Check it is correct to split here. */
17250 if (!ix86_ok_to_clobber_flags (insn))
17251 return false;
17253 regno0 = true_regnum (operands[0]);
17254 regno1 = true_regnum (operands[1]);
17255 regno2 = true_regnum (operands[2]);
17257 /* We only need to split adds with a non-destructive
17258 destination operand. */
17259 if (regno0 == regno1 || regno0 == regno2)
17260 return false;
17261 else
17262 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17265 /* Return true if we should emit lea instruction instead of mov
17266 instruction. */
17268 bool
17269 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17271 unsigned int regno0, regno1;
17273 /* Check if we need to optimize. */
17274 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17275 return false;
17277 /* Use lea for reg to reg moves only. */
17278 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17279 return false;
17281 regno0 = true_regnum (operands[0]);
17282 regno1 = true_regnum (operands[1]);
17284 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17287 /* Return true if we need to split lea into a sequence of
17288 instructions to avoid AGU stalls. */
17290 bool
17291 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17293 unsigned int regno0, regno1, regno2;
17294 int split_cost;
17295 struct ix86_address parts;
17296 int ok;
17298 /* Check we need to optimize. */
17299 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17300 return false;
17302 /* Check it is correct to split here. */
17303 if (!ix86_ok_to_clobber_flags (insn))
17304 return false;
17306 ok = ix86_decompose_address (operands[1], &parts);
17307 gcc_assert (ok);
17309 /* There should be at least two components in the address. */
17310 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17311 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17312 return false;
17314 /* We should not split into add if a non-legitimate PIC
17315 operand is used as the displacement. */
17316 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17317 return false;
17319 regno0 = true_regnum (operands[0]);
17320 regno1 = INVALID_REGNUM;
17321 regno2 = INVALID_REGNUM;
17323 if (parts.base)
17324 regno1 = true_regnum (parts.base);
17325 if (parts.index)
17326 regno2 = true_regnum (parts.index);
17328 split_cost = 0;
17330 /* Compute how many cycles we will add to the execution time
17331 if we split the lea into a sequence of instructions. */
17332 if (parts.base || parts.index)
17334 /* Have to use a mov instruction if the non-destructive
17335 destination form is used. */
17336 if (regno1 != regno0 && regno2 != regno0)
17337 split_cost += 1;
17339 /* Have to add index to base if both exist. */
17340 if (parts.base && parts.index)
17341 split_cost += 1;
17343 /* Have to use shift and adds if scale is 2 or greater. */
17344 if (parts.scale > 1)
17346 if (regno0 != regno1)
17347 split_cost += 1;
17348 else if (regno2 == regno0)
17349 split_cost += 4;
17350 else
17351 split_cost += parts.scale;
17354 /* Have to use add instruction with immediate if
17355 disp is non zero. */
17356 if (parts.disp && parts.disp != const0_rtx)
17357 split_cost += 1;
17359 /* Subtract the price of lea. */
17360 split_cost -= 1;
17363 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17366 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17367 matches destination. RTX includes clobber of FLAGS_REG. */
17369 static void
17370 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17371 rtx dst, rtx src)
17373 rtx op, clob;
17375 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17376 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17378 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
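/* For reference, the emitted RTL has the shape

     (parallel [(set dst (code:MODE dst src))
                (clobber (reg:CC FLAGS_REG))])

   i.e. a two-operand ALU operation plus the flags clobber, matching the
   flag-clobbering arithmetic patterns in i386.md.  */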
17381 /* Return true if regno1 def is nearest to the insn. */
17383 static bool
17384 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17386 rtx prev = insn;
17387 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17389 if (insn == start)
17390 return false;
17391 while (prev && prev != start)
17393 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17395 prev = PREV_INSN (prev);
17396 continue;
17398 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17399 return true;
17400 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17401 return false;
17402 prev = PREV_INSN (prev);
17405 /* None of the regs is defined in the bb. */
17406 return false;
17409 /* Split lea instructions into a sequence of instructions
17410 which are executed on ALU to avoid AGU stalls.
17411 It is assumed that it is allowed to clobber flags register
17412 at lea position. */
17414 void
17415 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17417 unsigned int regno0, regno1, regno2;
17418 struct ix86_address parts;
17419 rtx target, tmp;
17420 int ok, adds;
17422 ok = ix86_decompose_address (operands[1], &parts);
17423 gcc_assert (ok);
17425 target = gen_lowpart (mode, operands[0]);
17427 regno0 = true_regnum (target);
17428 regno1 = INVALID_REGNUM;
17429 regno2 = INVALID_REGNUM;
17431 if (parts.base)
17433 parts.base = gen_lowpart (mode, parts.base);
17434 regno1 = true_regnum (parts.base);
17437 if (parts.index)
17439 parts.index = gen_lowpart (mode, parts.index);
17440 regno2 = true_regnum (parts.index);
17443 if (parts.disp)
17444 parts.disp = gen_lowpart (mode, parts.disp);
17446 if (parts.scale > 1)
17448 /* Case r1 = r1 + ... */
17449 if (regno1 == regno0)
17451 /* If we have a case r1 = r1 + C * r1 then we
17452 would have to use multiplication, which is very
17453 expensive. Assume the cost model is wrong if we
17454 have such a case here. */
17455 gcc_assert (regno2 != regno0);
17457 for (adds = parts.scale; adds > 0; adds--)
17458 ix86_emit_binop (PLUS, mode, target, parts.index);
17460 else
17462 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17463 if (regno0 != regno2)
17464 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17466 /* Use shift for scaling. */
17467 ix86_emit_binop (ASHIFT, mode, target,
17468 GEN_INT (exact_log2 (parts.scale)));
17470 if (parts.base)
17471 ix86_emit_binop (PLUS, mode, target, parts.base);
17473 if (parts.disp && parts.disp != const0_rtx)
17474 ix86_emit_binop (PLUS, mode, target, parts.disp);
17477 else if (!parts.base && !parts.index)
17479 gcc_assert (parts.disp);
17480 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17482 else
17484 if (!parts.base)
17486 if (regno0 != regno2)
17487 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17489 else if (!parts.index)
17491 if (regno0 != regno1)
17492 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17494 else
17496 if (regno0 == regno1)
17497 tmp = parts.index;
17498 else if (regno0 == regno2)
17499 tmp = parts.base;
17500 else
17502 rtx tmp1;
17504 /* Find better operand for SET instruction, depending
17505 on which definition is farther from the insn. */
17506 if (find_nearest_reg_def (insn, regno1, regno2))
17507 tmp = parts.index, tmp1 = parts.base;
17508 else
17509 tmp = parts.base, tmp1 = parts.index;
17511 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17513 if (parts.disp && parts.disp != const0_rtx)
17514 ix86_emit_binop (PLUS, mode, target, parts.disp);
17516 ix86_emit_binop (PLUS, mode, target, tmp1);
17517 return;
17520 ix86_emit_binop (PLUS, mode, target, tmp);
17523 if (parts.disp && parts.disp != const0_rtx)
17524 ix86_emit_binop (PLUS, mode, target, parts.disp);
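/* Worked example (a sketch; dst, base and index assumed distinct):
   an lea computing  dst = base + index * 4 + 8  is decomposed by the
   code above into roughly

     dst = index;
     dst <<= 2;
     dst += base;
     dst += 8;

   When the destination already overlaps one of the inputs, the initial
   move is dropped or a short chain of adds replaces the shift, as
   handled in the branches above.  */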
17528 /* Return true if it is ok to optimize an ADD operation to an LEA
17529 operation to avoid flag register consumption. For most processors,
17530 ADD is faster than LEA. For processors like ATOM, if the
17531 destination register of the LEA holds an actual address which will
17532 be used soon, LEA is better; otherwise ADD is better. */
17534 bool
17535 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17537 unsigned int regno0 = true_regnum (operands[0]);
17538 unsigned int regno1 = true_regnum (operands[1]);
17539 unsigned int regno2 = true_regnum (operands[2]);
17541 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17542 if (regno0 != regno1 && regno0 != regno2)
17543 return true;
17545 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17546 return false;
17548 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17551 /* Return true if destination reg of SET_BODY is shift count of
17552 USE_BODY. */
17554 static bool
17555 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17557 rtx set_dest;
17558 rtx shift_rtx;
17559 int i;
17561 /* Retrieve destination of SET_BODY. */
17562 switch (GET_CODE (set_body))
17564 case SET:
17565 set_dest = SET_DEST (set_body);
17566 if (!set_dest || !REG_P (set_dest))
17567 return false;
17568 break;
17569 case PARALLEL:
17570 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17571 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17572 use_body))
17573 return true;
17574 default:
17575 return false;
17576 break;
17579 /* Retrieve shift count of USE_BODY. */
17580 switch (GET_CODE (use_body))
17582 case SET:
17583 shift_rtx = XEXP (use_body, 1);
17584 break;
17585 case PARALLEL:
17586 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17587 if (ix86_dep_by_shift_count_body (set_body,
17588 XVECEXP (use_body, 0, i)))
17589 return true;
17590 default:
17591 return false;
17592 break;
17595 if (shift_rtx
17596 && (GET_CODE (shift_rtx) == ASHIFT
17597 || GET_CODE (shift_rtx) == LSHIFTRT
17598 || GET_CODE (shift_rtx) == ASHIFTRT
17599 || GET_CODE (shift_rtx) == ROTATE
17600 || GET_CODE (shift_rtx) == ROTATERT))
17602 rtx shift_count = XEXP (shift_rtx, 1);
17604 /* Return true if shift count is dest of SET_BODY. */
17605 if (REG_P (shift_count))
17607 /* Add this check since the function can be invoked before
17608 register allocation by the pre-reload scheduler. */
17609 if (reload_completed
17610 && true_regnum (set_dest) == true_regnum (shift_count))
17611 return true;
17612 else if (REGNO (set_dest) == REGNO (shift_count))
17613 return true;
17617 return false;
17620 /* Return true if destination reg of SET_INSN is shift count of
17621 USE_INSN. */
17623 bool
17624 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17626 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17627 PATTERN (use_insn));
17630 /* Return TRUE or FALSE depending on whether the unary operator meets the
17631 appropriate constraints. */
17633 bool
17634 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17635 enum machine_mode mode ATTRIBUTE_UNUSED,
17636 rtx operands[2] ATTRIBUTE_UNUSED)
17638 /* If one of operands is memory, source and destination must match. */
17639 if ((MEM_P (operands[0])
17640 || MEM_P (operands[1]))
17641 && ! rtx_equal_p (operands[0], operands[1]))
17642 return false;
17643 return true;
17646 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17647 are ok, keeping in mind the possible movddup alternative. */
17649 bool
17650 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17652 if (MEM_P (operands[0]))
17653 return rtx_equal_p (operands[0], operands[1 + high]);
17654 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17655 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17656 return true;
17659 /* Post-reload splitter for converting an SF or DFmode value in an
17660 SSE register into an unsigned SImode. */
17662 void
17663 ix86_split_convert_uns_si_sse (rtx operands[])
17665 enum machine_mode vecmode;
17666 rtx value, large, zero_or_two31, input, two31, x;
17668 large = operands[1];
17669 zero_or_two31 = operands[2];
17670 input = operands[3];
17671 two31 = operands[4];
17672 vecmode = GET_MODE (large);
17673 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17675 /* Load up the value into the low element. We must ensure that the other
17676 elements are valid floats -- zero is the easiest such value. */
17677 if (MEM_P (input))
17679 if (vecmode == V4SFmode)
17680 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17681 else
17682 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17684 else
17686 input = gen_rtx_REG (vecmode, REGNO (input));
17687 emit_move_insn (value, CONST0_RTX (vecmode));
17688 if (vecmode == V4SFmode)
17689 emit_insn (gen_sse_movss (value, value, input));
17690 else
17691 emit_insn (gen_sse2_movsd (value, value, input));
17694 emit_move_insn (large, two31);
17695 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17697 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17698 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17700 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17701 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17703 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17704 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17706 large = gen_rtx_REG (V4SImode, REGNO (large));
17707 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17709 x = gen_rtx_REG (V4SImode, REGNO (value));
17710 if (vecmode == V4SFmode)
17711 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17712 else
17713 emit_insn (gen_sse2_cvttpd2dq (x, value));
17714 value = x;
17716 emit_insn (gen_xorv4si3 (value, value, large));
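/* The underlying scalar idea, as a minimal C sketch (assuming a value
   in [0, 2**32) and truncation toward zero):

     uint32_t
     float_to_uint32 (float f)
     {
       if (f >= 0x1.0p31f)
         return (uint32_t) (int32_t) (f - 0x1.0p31f) ^ 0x80000000u;
       return (uint32_t) (int32_t) f;
     }

   The vector code above computes the >= 2**31 mask, conditionally
   subtracts 2**31, truncates with the signed cvtt* instruction, and
   xors the mask (shifted down to just the sign bit) back in.  */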
17719 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17720 Expects the 64-bit DImode to be supplied in a pair of integral
17721 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17722 -mfpmath=sse, !optimize_size only. */
17724 void
17725 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17727 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17728 rtx int_xmm, fp_xmm;
17729 rtx biases, exponents;
17730 rtx x;
17732 int_xmm = gen_reg_rtx (V4SImode);
17733 if (TARGET_INTER_UNIT_MOVES)
17734 emit_insn (gen_movdi_to_sse (int_xmm, input));
17735 else if (TARGET_SSE_SPLIT_REGS)
17737 emit_clobber (int_xmm);
17738 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17740 else
17742 x = gen_reg_rtx (V2DImode);
17743 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17744 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17747 x = gen_rtx_CONST_VECTOR (V4SImode,
17748 gen_rtvec (4, GEN_INT (0x43300000UL),
17749 GEN_INT (0x45300000UL),
17750 const0_rtx, const0_rtx));
17751 exponents = validize_mem (force_const_mem (V4SImode, x));
17753 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17754 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17756 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17757 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17758 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17759 (0x1.0p84 + double(fp_value_hi_xmm)).
17760 Note these exponents differ by 32. */
17762 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17764 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17765 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17766 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17767 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17768 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17769 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17770 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17771 biases = validize_mem (force_const_mem (V2DFmode, biases));
17772 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17774 /* Add the upper and lower DFmode values together. */
17775 if (TARGET_SSE3)
17776 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17777 else
17779 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17780 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17781 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17784 ix86_expand_vector_extract (false, target, fp_xmm, 0);
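/* The underlying scalar trick, sketched in C (assuming IEEE-754
   doubles and the usual union type punning):

     static double
     uns64_to_double (uint64_t x)
     {
       union { uint64_t i; double d; } lo, hi;

       lo.i = UINT64_C (0x4330000000000000) | (x & 0xffffffff);
       hi.i = UINT64_C (0x4530000000000000) | (x >> 32);
       return (lo.d - 0x1.0p52) + (hi.d - 0x1.0p84);
     }

   lo.d is exactly 2**52 + low32 and hi.d is exactly 2**84 + high32 * 2**32,
   so subtracting the biases and adding the two halves yields the 64-bit
   value with a single rounding.  The interleave above builds both biased
   halves in one shot against the exponent constants.  */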
17787 /* Not used, but eases macroization of patterns. */
17788 void
17789 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17790 rtx input ATTRIBUTE_UNUSED)
17792 gcc_unreachable ();
17795 /* Convert an unsigned SImode value into a DFmode. Only currently used
17796 for SSE, but applicable anywhere. */
17798 void
17799 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17801 REAL_VALUE_TYPE TWO31r;
17802 rtx x, fp;
17804 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17805 NULL, 1, OPTAB_DIRECT);
17807 fp = gen_reg_rtx (DFmode);
17808 emit_insn (gen_floatsidf2 (fp, x));
17810 real_ldexp (&TWO31r, &dconst1, 31);
17811 x = const_double_from_real_value (TWO31r, DFmode);
17813 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17814 if (x != target)
17815 emit_move_insn (target, x);
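/* Scalar equivalent, for intuition (a sketch; the wraparound relies on
   the usual two's complement conversion):

     double
     uns32_to_double (uint32_t u)
     {
       return (double) (int32_t) (u + 0x80000000u) + 0x1.0p31;
     }

   Adding 2**31 modulo 2**32 and reinterpreting as signed yields
   u - 2**31 in the signed range; converting that and adding 2**31.0
   back recovers the unsigned value exactly.  */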
17818 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17819 32-bit mode; otherwise we have a direct convert instruction. */
17821 void
17822 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17824 REAL_VALUE_TYPE TWO32r;
17825 rtx fp_lo, fp_hi, x;
17827 fp_lo = gen_reg_rtx (DFmode);
17828 fp_hi = gen_reg_rtx (DFmode);
17830 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17832 real_ldexp (&TWO32r, &dconst1, 32);
17833 x = const_double_from_real_value (TWO32r, DFmode);
17834 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17836 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17838 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17839 0, OPTAB_DIRECT);
17840 if (x != target)
17841 emit_move_insn (target, x);
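/* The identity being used, as a sketch for an int64_t value X:

     (double) X == (double) high * 0x1.0p32 + (double) (uint32_t) low

   where HIGH is the signed upper 32 bits and LOW the unsigned lower 32
   bits.  Both word conversions and the scaling by 2**32 are exact; only
   the final addition rounds.  */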
17844 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17845 For x86_32, -mfpmath=sse, !optimize_size only. */
17846 void
17847 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17849 REAL_VALUE_TYPE ONE16r;
17850 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17852 real_ldexp (&ONE16r, &dconst1, 16);
17853 x = const_double_from_real_value (ONE16r, SFmode);
17854 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17855 NULL, 0, OPTAB_DIRECT);
17856 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17857 NULL, 0, OPTAB_DIRECT);
17858 fp_hi = gen_reg_rtx (SFmode);
17859 fp_lo = gen_reg_rtx (SFmode);
17860 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17861 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17862 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17863 0, OPTAB_DIRECT);
17864 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17865 0, OPTAB_DIRECT);
17866 if (!rtx_equal_p (target, fp_hi))
17867 emit_move_insn (target, fp_hi);
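/* Scalar sketch of the same computation:

     float
     uns32_to_float (uint32_t u)
     {
       return (float) (u >> 16) * 65536.0f + (float) (u & 0xffff);
     }

   Each 16-bit half converts exactly and the scaling by 2**16 is exact,
   so only the final addition rounds, giving the correctly rounded
   result.  */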
17870 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17871 a vector of unsigned ints VAL to vector of floats TARGET. */
17873 void
17874 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17876 rtx tmp[8];
17877 REAL_VALUE_TYPE TWO16r;
17878 enum machine_mode intmode = GET_MODE (val);
17879 enum machine_mode fltmode = GET_MODE (target);
17880 rtx (*cvt) (rtx, rtx);
17882 if (intmode == V4SImode)
17883 cvt = gen_floatv4siv4sf2;
17884 else
17885 cvt = gen_floatv8siv8sf2;
17886 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17887 tmp[0] = force_reg (intmode, tmp[0]);
17888 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17889 OPTAB_DIRECT);
17890 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17891 NULL_RTX, 1, OPTAB_DIRECT);
17892 tmp[3] = gen_reg_rtx (fltmode);
17893 emit_insn (cvt (tmp[3], tmp[1]));
17894 tmp[4] = gen_reg_rtx (fltmode);
17895 emit_insn (cvt (tmp[4], tmp[2]));
17896 real_ldexp (&TWO16r, &dconst1, 16);
17897 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17898 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17899 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17900 OPTAB_DIRECT);
17901 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17902 OPTAB_DIRECT);
17903 if (tmp[7] != target)
17904 emit_move_insn (target, tmp[7]);
17907 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17908 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17909 This is done by doing just a signed conversion if the value is < 0x1p31, and
17910 otherwise by subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17913 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17915 REAL_VALUE_TYPE TWO31r;
17916 rtx two31r, tmp[4];
17917 enum machine_mode mode = GET_MODE (val);
17918 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17919 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17920 rtx (*cmp) (rtx, rtx, rtx, rtx);
17921 int i;
17923 for (i = 0; i < 3; i++)
17924 tmp[i] = gen_reg_rtx (mode);
17925 real_ldexp (&TWO31r, &dconst1, 31);
17926 two31r = const_double_from_real_value (TWO31r, scalarmode);
17927 two31r = ix86_build_const_vector (mode, 1, two31r);
17928 two31r = force_reg (mode, two31r);
17929 switch (mode)
17931 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17932 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17933 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17934 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17935 default: gcc_unreachable ();
17937 tmp[3] = gen_rtx_LE (mode, two31r, val);
17938 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17939 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17940 0, OPTAB_DIRECT);
17941 if (intmode == V4SImode || TARGET_AVX2)
17942 *xorp = expand_simple_binop (intmode, ASHIFT,
17943 gen_lowpart (intmode, tmp[0]),
17944 GEN_INT (31), NULL_RTX, 0,
17945 OPTAB_DIRECT);
17946 else
17948 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17949 two31 = ix86_build_const_vector (intmode, 1, two31);
17950 *xorp = expand_simple_binop (intmode, AND,
17951 gen_lowpart (intmode, tmp[0]),
17952 two31, NULL_RTX, 0,
17953 OPTAB_DIRECT);
17955 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17956 0, OPTAB_DIRECT);
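/* This is the same conditional-subtract trick used by
   ix86_split_convert_uns_si_sse above, applied before a signed vector
   truncation: lanes >= 2**31 get 2**31 subtracted here, and the caller
   xors the returned *XORP value (0x80000000 in exactly those lanes)
   into the truncated result to restore the top bit.  */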
17959 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17960 then replicate the value for all elements of the vector
17961 register. */
17964 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17966 int i, n_elt;
17967 rtvec v;
17968 enum machine_mode scalar_mode;
17970 switch (mode)
17972 case V32QImode:
17973 case V16QImode:
17974 case V16HImode:
17975 case V8HImode:
17976 case V8SImode:
17977 case V4SImode:
17978 case V4DImode:
17979 case V2DImode:
17980 gcc_assert (vect);
17981 case V8SFmode:
17982 case V4SFmode:
17983 case V4DFmode:
17984 case V2DFmode:
17985 n_elt = GET_MODE_NUNITS (mode);
17986 v = rtvec_alloc (n_elt);
17987 scalar_mode = GET_MODE_INNER (mode);
17989 RTVEC_ELT (v, 0) = value;
17991 for (i = 1; i < n_elt; ++i)
17992 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17994 return gen_rtx_CONST_VECTOR (mode, v);
17996 default:
17997 gcc_unreachable ();
18001 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18002 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18003 for an SSE register. If VECT is true, then replicate the mask for
18004 all elements of the vector register. If INVERT is true, then create
18005 a mask excluding the sign bit. */
18008 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18010 enum machine_mode vec_mode, imode;
18011 HOST_WIDE_INT hi, lo;
18012 int shift = 63;
18013 rtx v;
18014 rtx mask;
18016 /* Find the sign bit, sign extended to 2*HWI. */
18017 switch (mode)
18019 case V8SImode:
18020 case V4SImode:
18021 case V8SFmode:
18022 case V4SFmode:
18023 vec_mode = mode;
18024 mode = GET_MODE_INNER (mode);
18025 imode = SImode;
18026 lo = 0x80000000, hi = lo < 0;
18027 break;
18029 case V4DImode:
18030 case V2DImode:
18031 case V4DFmode:
18032 case V2DFmode:
18033 vec_mode = mode;
18034 mode = GET_MODE_INNER (mode);
18035 imode = DImode;
18036 if (HOST_BITS_PER_WIDE_INT >= 64)
18037 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18038 else
18039 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18040 break;
18042 case TImode:
18043 case TFmode:
18044 vec_mode = VOIDmode;
18045 if (HOST_BITS_PER_WIDE_INT >= 64)
18047 imode = TImode;
18048 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18050 else
18052 rtvec vec;
18054 imode = DImode;
18055 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18057 if (invert)
18059 lo = ~lo, hi = ~hi;
18060 v = constm1_rtx;
18062 else
18063 v = const0_rtx;
18065 mask = immed_double_const (lo, hi, imode);
18067 vec = gen_rtvec (2, v, mask);
18068 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18069 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18071 return v;
18073 break;
18075 default:
18076 gcc_unreachable ();
18079 if (invert)
18080 lo = ~lo, hi = ~hi;
18082 /* Force this value into the low part of a fp vector constant. */
18083 mask = immed_double_const (lo, hi, imode);
18084 mask = gen_lowpart (mode, mask);
18086 if (vec_mode == VOIDmode)
18087 return force_reg (mode, mask);
18089 v = ix86_build_const_vector (vec_mode, vect, mask);
18090 return force_reg (vec_mode, v);
18093 /* Generate code for floating point ABS or NEG. */
18095 void
18096 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18097 rtx operands[])
18099 rtx mask, set, dst, src;
18100 bool use_sse = false;
18101 bool vector_mode = VECTOR_MODE_P (mode);
18102 enum machine_mode vmode = mode;
18104 if (vector_mode)
18105 use_sse = true;
18106 else if (mode == TFmode)
18107 use_sse = true;
18108 else if (TARGET_SSE_MATH)
18110 use_sse = SSE_FLOAT_MODE_P (mode);
18111 if (mode == SFmode)
18112 vmode = V4SFmode;
18113 else if (mode == DFmode)
18114 vmode = V2DFmode;
18117 /* NEG and ABS performed with SSE use bitwise mask operations.
18118 Create the appropriate mask now. */
18119 if (use_sse)
18120 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18121 else
18122 mask = NULL_RTX;
18124 dst = operands[0];
18125 src = operands[1];
18127 set = gen_rtx_fmt_e (code, mode, src);
18128 set = gen_rtx_SET (VOIDmode, dst, set);
18130 if (mask)
18132 rtx use, clob;
18133 rtvec par;
18135 use = gen_rtx_USE (VOIDmode, mask);
18136 if (vector_mode)
18137 par = gen_rtvec (2, set, use);
18138 else
18140 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18141 par = gen_rtvec (3, set, use, clob);
18143 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18145 else
18146 emit_insn (set);
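/* In scalar terms the masks turn these into pure bit operations
   (a sketch, assuming IEEE-754 single precision and union punning):

     union { float f; uint32_t i; } u = { .f = x };

     u.i ^= 0x80000000u;    for negation: flip the sign bit
     u.i &= 0x7fffffffu;    for fabs:     clear the sign bit

   Accordingly, ABS builds the inverted mask (sign bit excluded) and
   uses AND, while NEG builds the sign-bit mask and uses XOR.  */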
18149 /* Expand a copysign operation. Special case operand 0 being a constant. */
18151 void
18152 ix86_expand_copysign (rtx operands[])
18154 enum machine_mode mode, vmode;
18155 rtx dest, op0, op1, mask, nmask;
18157 dest = operands[0];
18158 op0 = operands[1];
18159 op1 = operands[2];
18161 mode = GET_MODE (dest);
18163 if (mode == SFmode)
18164 vmode = V4SFmode;
18165 else if (mode == DFmode)
18166 vmode = V2DFmode;
18167 else
18168 vmode = mode;
18170 if (GET_CODE (op0) == CONST_DOUBLE)
18172 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18174 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18175 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18177 if (mode == SFmode || mode == DFmode)
18179 if (op0 == CONST0_RTX (mode))
18180 op0 = CONST0_RTX (vmode);
18181 else
18183 rtx v = ix86_build_const_vector (vmode, false, op0);
18185 op0 = force_reg (vmode, v);
18188 else if (op0 != CONST0_RTX (mode))
18189 op0 = force_reg (mode, op0);
18191 mask = ix86_build_signbit_mask (vmode, 0, 0);
18193 if (mode == SFmode)
18194 copysign_insn = gen_copysignsf3_const;
18195 else if (mode == DFmode)
18196 copysign_insn = gen_copysigndf3_const;
18197 else
18198 copysign_insn = gen_copysigntf3_const;
18200 emit_insn (copysign_insn (dest, op0, op1, mask));
18202 else
18204 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18206 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18207 mask = ix86_build_signbit_mask (vmode, 0, 0);
18209 if (mode == SFmode)
18210 copysign_insn = gen_copysignsf3_var;
18211 else if (mode == DFmode)
18212 copysign_insn = gen_copysigndf3_var;
18213 else
18214 copysign_insn = gen_copysigntf3_var;
18216 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18220 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18221 be a constant, and so has already been expanded into a vector constant. */
18223 void
18224 ix86_split_copysign_const (rtx operands[])
18226 enum machine_mode mode, vmode;
18227 rtx dest, op0, mask, x;
18229 dest = operands[0];
18230 op0 = operands[1];
18231 mask = operands[3];
18233 mode = GET_MODE (dest);
18234 vmode = GET_MODE (mask);
18236 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18237 x = gen_rtx_AND (vmode, dest, mask);
18238 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18240 if (op0 != CONST0_RTX (vmode))
18242 x = gen_rtx_IOR (vmode, dest, op0);
18243 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18247 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18248 so we have to do two masks. */
18250 void
18251 ix86_split_copysign_var (rtx operands[])
18253 enum machine_mode mode, vmode;
18254 rtx dest, scratch, op0, op1, mask, nmask, x;
18256 dest = operands[0];
18257 scratch = operands[1];
18258 op0 = operands[2];
18259 op1 = operands[3];
18260 nmask = operands[4];
18261 mask = operands[5];
18263 mode = GET_MODE (dest);
18264 vmode = GET_MODE (mask);
18266 if (rtx_equal_p (op0, op1))
18268 /* Shouldn't happen often (it's useless, obviously), but when it does
18269 we'd generate incorrect code if we continue below. */
18270 emit_move_insn (dest, op0);
18271 return;
18274 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18276 gcc_assert (REGNO (op1) == REGNO (scratch));
18278 x = gen_rtx_AND (vmode, scratch, mask);
18279 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18281 dest = mask;
18282 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18283 x = gen_rtx_NOT (vmode, dest);
18284 x = gen_rtx_AND (vmode, x, op0);
18285 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18287 else
18289 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18291 x = gen_rtx_AND (vmode, scratch, mask);
18293 else /* alternative 2,4 */
18295 gcc_assert (REGNO (mask) == REGNO (scratch));
18296 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18297 x = gen_rtx_AND (vmode, scratch, op1);
18299 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18301 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18303 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18304 x = gen_rtx_AND (vmode, dest, nmask);
18306 else /* alternative 3,4 */
18308 gcc_assert (REGNO (nmask) == REGNO (dest));
18309 dest = nmask;
18310 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18311 x = gen_rtx_AND (vmode, dest, op0);
18313 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18316 x = gen_rtx_IOR (vmode, dest, scratch);
18317 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
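/* The bitwise recipe being implemented, as a scalar sketch for SFmode:

     result = (mag & 0x7fffffffu) | (sign & 0x80000000u);

   NMASK is the sign-cleared mask applied to the magnitude operand,
   MASK is the sign-bit mask applied to the sign operand, and the two
   AND results are ORed together.  */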
18320 /* Return TRUE or FALSE depending on whether the first SET in INSN
18321 has source and destination with matching CC modes, and that the
18322 CC mode is at least as constrained as REQ_MODE. */
18324 bool
18325 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18327 rtx set;
18328 enum machine_mode set_mode;
18330 set = PATTERN (insn);
18331 if (GET_CODE (set) == PARALLEL)
18332 set = XVECEXP (set, 0, 0);
18333 gcc_assert (GET_CODE (set) == SET);
18334 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18336 set_mode = GET_MODE (SET_DEST (set));
18337 switch (set_mode)
18339 case CCNOmode:
18340 if (req_mode != CCNOmode
18341 && (req_mode != CCmode
18342 || XEXP (SET_SRC (set), 1) != const0_rtx))
18343 return false;
18344 break;
18345 case CCmode:
18346 if (req_mode == CCGCmode)
18347 return false;
18348 /* FALLTHRU */
18349 case CCGCmode:
18350 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18351 return false;
18352 /* FALLTHRU */
18353 case CCGOCmode:
18354 if (req_mode == CCZmode)
18355 return false;
18356 /* FALLTHRU */
18357 case CCZmode:
18358 break;
18360 case CCAmode:
18361 case CCCmode:
18362 case CCOmode:
18363 case CCSmode:
18364 if (set_mode != req_mode)
18365 return false;
18366 break;
18368 default:
18369 gcc_unreachable ();
18372 return GET_MODE (SET_SRC (set)) == set_mode;
18375 /* Generate insn patterns to do an integer compare of OPERANDS. */
18377 static rtx
18378 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18380 enum machine_mode cmpmode;
18381 rtx tmp, flags;
18383 cmpmode = SELECT_CC_MODE (code, op0, op1);
18384 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18386 /* This is very simple, but making the interface the same as in the
18387 FP case makes the rest of the code easier. */
18388 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18389 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18391 /* Return the test that should be put into the flags user, i.e.
18392 the bcc, scc, or cmov instruction. */
18393 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18396 /* Figure out whether to use ordered or unordered fp comparisons.
18397 Return the appropriate mode to use. */
18399 enum machine_mode
18400 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18402 /* ??? In order to make all comparisons reversible, we do all comparisons
18403 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18404 all forms trapping and nontrapping comparisons, we can make inequality
18405 comparisons trapping again, since it results in better code when using
18406 FCOM based compares. */
18407 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18410 enum machine_mode
18411 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18413 enum machine_mode mode = GET_MODE (op0);
18415 if (SCALAR_FLOAT_MODE_P (mode))
18417 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18418 return ix86_fp_compare_mode (code);
18421 switch (code)
18423 /* Only zero flag is needed. */
18424 case EQ: /* ZF=0 */
18425 case NE: /* ZF!=0 */
18426 return CCZmode;
18427 /* Codes needing carry flag. */
18428 case GEU: /* CF=0 */
18429 case LTU: /* CF=1 */
18430 /* Detect overflow checks. They need just the carry flag. */
18431 if (GET_CODE (op0) == PLUS
18432 && rtx_equal_p (op1, XEXP (op0, 0)))
18433 return CCCmode;
18434 else
18435 return CCmode;
18436 case GTU: /* CF=0 & ZF=0 */
18437 case LEU: /* CF=1 | ZF=1 */
18438 /* Detect overflow checks. They need just the carry flag. */
18439 if (GET_CODE (op0) == MINUS
18440 && rtx_equal_p (op1, XEXP (op0, 0)))
18441 return CCCmode;
18442 else
18443 return CCmode;
18444 /* Codes possibly doable with only the sign flag when
18445 comparing against zero. */
18446 case GE: /* SF=OF or SF=0 */
18447 case LT: /* SF<>OF or SF=1 */
18448 if (op1 == const0_rtx)
18449 return CCGOCmode;
18450 else
18451 /* For other cases the Carry flag is not required. */
18452 return CCGCmode;
18453 /* Codes doable only with the sign flag when comparing
18454 against zero, but we miss the jump instruction for it,
18455 so we need to use relational tests against overflow,
18456 which thus needs to be zero. */
18457 case GT: /* ZF=0 & SF=OF */
18458 case LE: /* ZF=1 | SF<>OF */
18459 if (op1 == const0_rtx)
18460 return CCNOmode;
18461 else
18462 return CCGCmode;
18463 /* The strcmp pattern does (use flags) and combine may ask us
18464 for the proper mode. */
18465 case USE:
18466 return CCmode;
18467 default:
18468 gcc_unreachable ();
18472 /* Return the fixed registers used for condition codes. */
18474 static bool
18475 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18477 *p1 = FLAGS_REG;
18478 *p2 = FPSR_REG;
18479 return true;
18482 /* If two condition code modes are compatible, return a condition code
18483 mode which is compatible with both. Otherwise, return
18484 VOIDmode. */
18486 static enum machine_mode
18487 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18489 if (m1 == m2)
18490 return m1;
18492 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18493 return VOIDmode;
18495 if ((m1 == CCGCmode && m2 == CCGOCmode)
18496 || (m1 == CCGOCmode && m2 == CCGCmode))
18497 return CCGCmode;
18499 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18500 return m2;
18501 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18502 return m1;
18504 switch (m1)
18506 default:
18507 gcc_unreachable ();
18509 case CCmode:
18510 case CCGCmode:
18511 case CCGOCmode:
18512 case CCNOmode:
18513 case CCAmode:
18514 case CCCmode:
18515 case CCOmode:
18516 case CCSmode:
18517 case CCZmode:
18518 switch (m2)
18520 default:
18521 return VOIDmode;
18523 case CCmode:
18524 case CCGCmode:
18525 case CCGOCmode:
18526 case CCNOmode:
18527 case CCAmode:
18528 case CCCmode:
18529 case CCOmode:
18530 case CCSmode:
18531 case CCZmode:
18532 return CCmode;
18535 case CCFPmode:
18536 case CCFPUmode:
18537 /* These are only compatible with themselves, which we already
18538 checked above. */
18539 return VOIDmode;
18544 /* Return a comparison we can do and that it is equivalent to
18545 swap_condition (code) apart possibly from orderedness.
18546 But, never change orderedness if TARGET_IEEE_FP, returning
18547 UNKNOWN in that case if necessary. */
18549 static enum rtx_code
18550 ix86_fp_swap_condition (enum rtx_code code)
18552 switch (code)
18554 case GT: /* GTU - CF=0 & ZF=0 */
18555 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18556 case GE: /* GEU - CF=0 */
18557 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18558 case UNLT: /* LTU - CF=1 */
18559 return TARGET_IEEE_FP ? UNKNOWN : GT;
18560 case UNLE: /* LEU - CF=1 | ZF=1 */
18561 return TARGET_IEEE_FP ? UNKNOWN : GE;
18562 default:
18563 return swap_condition (code);
18567 /* Return the cost of comparison CODE using the best strategy for performance.
18568 All of the following functions use the number of instructions as a cost metric.
18569 In the future this should be tweaked to compute bytes for optimize_size and
18570 take into account the performance of various instructions on various CPUs. */
18572 static int
18573 ix86_fp_comparison_cost (enum rtx_code code)
18575 int arith_cost;
18577 /* The cost of code using bit-twiddling on %ah. */
18578 switch (code)
18580 case UNLE:
18581 case UNLT:
18582 case LTGT:
18583 case GT:
18584 case GE:
18585 case UNORDERED:
18586 case ORDERED:
18587 case UNEQ:
18588 arith_cost = 4;
18589 break;
18590 case LT:
18591 case NE:
18592 case EQ:
18593 case UNGE:
18594 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18595 break;
18596 case LE:
18597 case UNGT:
18598 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18599 break;
18600 default:
18601 gcc_unreachable ();
18604 switch (ix86_fp_comparison_strategy (code))
18606 case IX86_FPCMP_COMI:
18607 return arith_cost > 4 ? 3 : 2;
18608 case IX86_FPCMP_SAHF:
18609 return arith_cost > 4 ? 4 : 3;
18610 default:
18611 return arith_cost;
18615 /* Return the strategy to use for a floating-point comparison. We assume
18616 that fcomi is always preferable where available, since that is also true when
18617 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18619 enum ix86_fpcmp_strategy
18620 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18622 /* Do fcomi/sahf based test when profitable. */
18624 if (TARGET_CMOVE)
18625 return IX86_FPCMP_COMI;
18627 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18628 return IX86_FPCMP_SAHF;
18630 return IX86_FPCMP_ARITH;
18633 /* Swap, force into registers, or otherwise massage the two operands
18634 to a fp comparison. The operands are updated in place; the new
18635 comparison code is returned. */
18637 static enum rtx_code
18638 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18640 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18641 rtx op0 = *pop0, op1 = *pop1;
18642 enum machine_mode op_mode = GET_MODE (op0);
18643 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18645 /* All of the unordered compare instructions only work on registers.
18646 The same is true of the fcomi compare instructions. The XFmode
18647 compare instructions require registers except when comparing
18648 against zero or when converting operand 1 from fixed point to
18649 floating point. */
18651 if (!is_sse
18652 && (fpcmp_mode == CCFPUmode
18653 || (op_mode == XFmode
18654 && ! (standard_80387_constant_p (op0) == 1
18655 || standard_80387_constant_p (op1) == 1)
18656 && GET_CODE (op1) != FLOAT)
18657 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18659 op0 = force_reg (op_mode, op0);
18660 op1 = force_reg (op_mode, op1);
18662 else
18664 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18665 things around if they appear profitable, otherwise force op0
18666 into a register. */
18668 if (standard_80387_constant_p (op0) == 0
18669 || (MEM_P (op0)
18670 && ! (standard_80387_constant_p (op1) == 0
18671 || MEM_P (op1))))
18673 enum rtx_code new_code = ix86_fp_swap_condition (code);
18674 if (new_code != UNKNOWN)
18676 rtx tmp;
18677 tmp = op0, op0 = op1, op1 = tmp;
18678 code = new_code;
18682 if (!REG_P (op0))
18683 op0 = force_reg (op_mode, op0);
18685 if (CONSTANT_P (op1))
18687 int tmp = standard_80387_constant_p (op1);
18688 if (tmp == 0)
18689 op1 = validize_mem (force_const_mem (op_mode, op1));
18690 else if (tmp == 1)
18692 if (TARGET_CMOVE)
18693 op1 = force_reg (op_mode, op1);
18695 else
18696 op1 = force_reg (op_mode, op1);
18700 /* Try to rearrange the comparison to make it cheaper. */
18701 if (ix86_fp_comparison_cost (code)
18702 > ix86_fp_comparison_cost (swap_condition (code))
18703 && (REG_P (op1) || can_create_pseudo_p ()))
18705 rtx tmp;
18706 tmp = op0, op0 = op1, op1 = tmp;
18707 code = swap_condition (code);
18708 if (!REG_P (op0))
18709 op0 = force_reg (op_mode, op0);
18712 *pop0 = op0;
18713 *pop1 = op1;
18714 return code;
18717 /* Convert comparison codes we use to represent FP comparison to integer
18718 code that will result in proper branch. Return UNKNOWN if no such code
18719 is available. */
18721 enum rtx_code
18722 ix86_fp_compare_code_to_integer (enum rtx_code code)
18724 switch (code)
18726 case GT:
18727 return GTU;
18728 case GE:
18729 return GEU;
18730 case ORDERED:
18731 case UNORDERED:
18732 return code;
18733 break;
18734 case UNEQ:
18735 return EQ;
18736 break;
18737 case UNLT:
18738 return LTU;
18739 break;
18740 case UNLE:
18741 return LEU;
18742 break;
18743 case LTGT:
18744 return NE;
18745 break;
18746 default:
18747 return UNKNOWN;
18751 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18753 static rtx
18754 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18756 enum machine_mode fpcmp_mode, intcmp_mode;
18757 rtx tmp, tmp2;
18759 fpcmp_mode = ix86_fp_compare_mode (code);
18760 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18762 /* Do fcomi/sahf based test when profitable. */
18763 switch (ix86_fp_comparison_strategy (code))
18765 case IX86_FPCMP_COMI:
18766 intcmp_mode = fpcmp_mode;
18767 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18768 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18769 tmp);
18770 emit_insn (tmp);
18771 break;
18773 case IX86_FPCMP_SAHF:
18774 intcmp_mode = fpcmp_mode;
18775 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18776 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18777 tmp);
18779 if (!scratch)
18780 scratch = gen_reg_rtx (HImode);
18781 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18782 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18783 break;
18785 case IX86_FPCMP_ARITH:
18786 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18787 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18788 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18789 if (!scratch)
18790 scratch = gen_reg_rtx (HImode);
18791 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18793 /* In the unordered case, we have to check C2 for NaN's, which
18794 doesn't happen to work out to anything nice combination-wise.
18795 So do some bit twiddling on the value we've got in AH to come
18796 up with an appropriate set of condition codes. */
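/* For reference: after fnstsw the x87 condition bits land in AH as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40, so 0x45 tests all three at once
   and 0x05 tests C0 and C2.  fcom sets C3:C2:C0 to 000 for >, 001 for
   <, 100 for == and 111 for unordered, which is what the masks below
   match against.  */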
18798 intcmp_mode = CCNOmode;
18799 switch (code)
18801 case GT:
18802 case UNGT:
18803 if (code == GT || !TARGET_IEEE_FP)
18805 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18806 code = EQ;
18808 else
18810 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18811 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18812 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18813 intcmp_mode = CCmode;
18814 code = GEU;
18816 break;
18817 case LT:
18818 case UNLT:
18819 if (code == LT && TARGET_IEEE_FP)
18821 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18822 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18823 intcmp_mode = CCmode;
18824 code = EQ;
18826 else
18828 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18829 code = NE;
18831 break;
18832 case GE:
18833 case UNGE:
18834 if (code == GE || !TARGET_IEEE_FP)
18836 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18837 code = EQ;
18839 else
18841 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18842 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18843 code = NE;
18845 break;
18846 case LE:
18847 case UNLE:
18848 if (code == LE && TARGET_IEEE_FP)
18850 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18851 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18852 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18853 intcmp_mode = CCmode;
18854 code = LTU;
18856 else
18858 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18859 code = NE;
18861 break;
18862 case EQ:
18863 case UNEQ:
18864 if (code == EQ && TARGET_IEEE_FP)
18866 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18867 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18868 intcmp_mode = CCmode;
18869 code = EQ;
18871 else
18873 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18874 code = NE;
18876 break;
18877 case NE:
18878 case LTGT:
18879 if (code == NE && TARGET_IEEE_FP)
18881 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18882 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18883 GEN_INT (0x40)));
18884 code = NE;
18886 else
18888 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18889 code = EQ;
18891 break;
18893 case UNORDERED:
18894 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18895 code = NE;
18896 break;
18897 case ORDERED:
18898 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18899 code = EQ;
18900 break;
18902 default:
18903 gcc_unreachable ();
18905 break;
18907 default:
18908 gcc_unreachable ();
18911 /* Return the test that should be put into the flags user, i.e.
18912 the bcc, scc, or cmov instruction. */
18913 return gen_rtx_fmt_ee (code, VOIDmode,
18914 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18915 const0_rtx);
18918 static rtx
18919 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18921 rtx ret;
18923 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18924 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18926 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18928 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18929 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18931 else
18932 ret = ix86_expand_int_compare (code, op0, op1);
18934 return ret;
18937 void
18938 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18940 enum machine_mode mode = GET_MODE (op0);
18941 rtx tmp;
18943 switch (mode)
18945 case SFmode:
18946 case DFmode:
18947 case XFmode:
18948 case QImode:
18949 case HImode:
18950 case SImode:
18951 simple:
18952 tmp = ix86_expand_compare (code, op0, op1);
18953 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18954 gen_rtx_LABEL_REF (VOIDmode, label),
18955 pc_rtx);
18956 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18957 return;
18959 case DImode:
18960 if (TARGET_64BIT)
18961 goto simple;
18962 case TImode:
18963 /* Expand DImode branch into multiple compare+branch. */
18965 rtx lo[2], hi[2], label2;
18966 enum rtx_code code1, code2, code3;
18967 enum machine_mode submode;
18969 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18971 tmp = op0, op0 = op1, op1 = tmp;
18972 code = swap_condition (code);
18975 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18976 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18978 submode = mode == DImode ? SImode : DImode;
18980 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18981 avoid two branches. This costs one extra insn, so disable when
18982 optimizing for size. */
18984 if ((code == EQ || code == NE)
18985 && (!optimize_insn_for_size_p ()
18986 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18988 rtx xor0, xor1;
18990 xor1 = hi[0];
18991 if (hi[1] != const0_rtx)
18992 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18993 NULL_RTX, 0, OPTAB_WIDEN);
18995 xor0 = lo[0];
18996 if (lo[1] != const0_rtx)
18997 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18998 NULL_RTX, 0, OPTAB_WIDEN);
19000 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19001 NULL_RTX, 0, OPTAB_WIDEN);
19003 ix86_expand_branch (code, tmp, const0_rtx, label);
19004 return;
19007 /* Otherwise, if we are doing a less-than or greater-or-equal
19008 comparison, op1 is a constant and its low word is zero, then we
19009 can just examine the high word. Similarly for a low word of -1
19010 and less-or-equal or greater-than. */
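/* For instance, a signed a < 0x0000012300000000 depends only on
   hi(a) < 0x123 because the low word of the constant is zero, and
   a <= 0x00000123ffffffff depends only on hi(a) <= 0x123 because
   the low word is all ones.  */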
19012 if (CONST_INT_P (hi[1]))
19013 switch (code)
19015 case LT: case LTU: case GE: case GEU:
19016 if (lo[1] == const0_rtx)
19018 ix86_expand_branch (code, hi[0], hi[1], label);
19019 return;
19021 break;
19022 case LE: case LEU: case GT: case GTU:
19023 if (lo[1] == constm1_rtx)
19025 ix86_expand_branch (code, hi[0], hi[1], label);
19026 return;
19028 break;
19029 default:
19030 break;
19033 /* Otherwise, we need two or three jumps. */
19035 label2 = gen_label_rtx ();
19037 code1 = code;
19038 code2 = swap_condition (code);
19039 code3 = unsigned_condition (code);
19041 switch (code)
19043 case LT: case GT: case LTU: case GTU:
19044 break;
19046 case LE: code1 = LT; code2 = GT; break;
19047 case GE: code1 = GT; code2 = LT; break;
19048 case LEU: code1 = LTU; code2 = GTU; break;
19049 case GEU: code1 = GTU; code2 = LTU; break;
19051 case EQ: code1 = UNKNOWN; code2 = NE; break;
19052 case NE: code2 = UNKNOWN; break;
19054 default:
19055 gcc_unreachable ();
19059 * a < b =>
19060 * if (hi(a) < hi(b)) goto true;
19061 * if (hi(a) > hi(b)) goto false;
19062 * if (lo(a) < lo(b)) goto true;
19063 * false:
19066 if (code1 != UNKNOWN)
19067 ix86_expand_branch (code1, hi[0], hi[1], label);
19068 if (code2 != UNKNOWN)
19069 ix86_expand_branch (code2, hi[0], hi[1], label2);
19071 ix86_expand_branch (code3, lo[0], lo[1], label);
19073 if (code2 != UNKNOWN)
19074 emit_label (label2);
19075 return;
19078 default:
19079 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19080 goto simple;
19084 /* Split branch based on floating point condition. */
19085 void
19086 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19087 rtx target1, rtx target2, rtx tmp, rtx pushed)
19089 rtx condition;
19090 rtx i;
19092 if (target2 != pc_rtx)
19094 rtx tmp = target2;
19095 code = reverse_condition_maybe_unordered (code);
19096 target2 = target1;
19097 target1 = tmp;
19100 condition = ix86_expand_fp_compare (code, op1, op2,
19101 tmp);
19103 /* Remove pushed operand from stack. */
19104 if (pushed)
19105 ix86_free_from_memory (GET_MODE (pushed));
19107 i = emit_jump_insn (gen_rtx_SET
19108 (VOIDmode, pc_rtx,
19109 gen_rtx_IF_THEN_ELSE (VOIDmode,
19110 condition, target1, target2)));
19111 if (split_branch_probability >= 0)
19112 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19115 void
19116 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19118 rtx ret;
19120 gcc_assert (GET_MODE (dest) == QImode);
19122 ret = ix86_expand_compare (code, op0, op1);
19123 PUT_MODE (ret, QImode);
19124 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19127 /* Expand a comparison setting or clearing the carry flag. Return true
19128 when successful, and set *POP to the comparison operation. */
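/* For instance, an unsigned a < b expands to a plain compare whose LTU
   result lives in the carry flag, which the callers below then consume
   directly with sbb or adc.  */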
19129 static bool
19130 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19132 enum machine_mode mode =
19133 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19135 /* Do not handle double-mode compares that go through a special path. */
19136 if (mode == (TARGET_64BIT ? TImode : DImode))
19137 return false;
19139 if (SCALAR_FLOAT_MODE_P (mode))
19141 rtx compare_op, compare_seq;
19143 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19145 /* Shortcut: the following common codes never translate
19146 into carry-flag compares. */
19147 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19148 || code == ORDERED || code == UNORDERED)
19149 return false;
19151 /* These comparisons require the zero flag; swap the operands so they won't. */
19152 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19153 && !TARGET_IEEE_FP)
19155 rtx tmp = op0;
19156 op0 = op1;
19157 op1 = tmp;
19158 code = swap_condition (code);
19161 /* Try to expand the comparison and verify that we end up with
19162 a carry-flag-based comparison. This fails only when we decide
19163 to expand the comparison using arithmetic, which is not a very
19164 common scenario. */
19165 start_sequence ();
19166 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19167 compare_seq = get_insns ();
19168 end_sequence ();
19170 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19171 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19172 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19173 else
19174 code = GET_CODE (compare_op);
19176 if (code != LTU && code != GEU)
19177 return false;
19179 emit_insn (compare_seq);
19180 *pop = compare_op;
19181 return true;
19184 if (!INTEGRAL_MODE_P (mode))
19185 return false;
19187 switch (code)
19189 case LTU:
19190 case GEU:
19191 break;
19193 /* Convert a==0 into (unsigned)a<1. */
19194 case EQ:
19195 case NE:
19196 if (op1 != const0_rtx)
19197 return false;
19198 op1 = const1_rtx;
19199 code = (code == EQ ? LTU : GEU);
19200 break;
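      /* E.g. (a == 0) becomes the unsigned a < 1, which sets the carry
	 flag exactly when a is zero; (a != 0) likewise becomes the
	 unsigned a >= 1.  */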
19202 /* Convert a>b into b<a or a>=b+1. */
19203 case GTU:
19204 case LEU:
19205 if (CONST_INT_P (op1))
19207 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19208 /* Bail out on overflow. We could still swap the operands, but
19209 that would force loading the constant into a register. */
19210 if (op1 == const0_rtx
19211 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19212 return false;
19213 code = (code == GTU ? GEU : LTU);
19215 else
19217 rtx tmp = op1;
19218 op1 = op0;
19219 op0 = tmp;
19220 code = (code == GTU ? LTU : GEU);
19222 break;
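      /* E.g. the unsigned a > 5 becomes a >= 6 and a <= 5 becomes a < 6;
	 with a non-constant right-hand side, a > b is instead rewritten
	 as b < a.  */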
19224 /* Convert a>=0 into (unsigned)a<0x80000000. */
19225 case LT:
19226 case GE:
19227 if (mode == DImode || op1 != const0_rtx)
19228 return false;
19229 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19230 code = (code == LT ? GEU : LTU);
19231 break;
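      /* E.g. for SImode, a >= 0 becomes the unsigned a < 0x80000000 and
	 a < 0 becomes the unsigned a >= 0x80000000, both of which are
	 carry-flag compares.  */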
19232 case LE:
19233 case GT:
19234 if (mode == DImode || op1 != constm1_rtx)
19235 return false;
19236 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19237 code = (code == LE ? GEU : LTU);
19238 break;
19240 default:
19241 return false;
19243 /* Swapping the operands may cause a constant to appear as the first operand. */
19244 if (!nonimmediate_operand (op0, VOIDmode))
19246 if (!can_create_pseudo_p ())
19247 return false;
19248 op0 = force_reg (mode, op0);
19250 *pop = ix86_expand_compare (code, op0, op1);
19251 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19252 return true;
19255 bool
19256 ix86_expand_int_movcc (rtx operands[])
19258 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19259 rtx compare_seq, compare_op;
19260 enum machine_mode mode = GET_MODE (operands[0]);
19261 bool sign_bit_compare_p = false;
19262 rtx op0 = XEXP (operands[1], 0);
19263 rtx op1 = XEXP (operands[1], 1);
19265 if (GET_MODE (op0) == TImode
19266 || (GET_MODE (op0) == DImode
19267 && !TARGET_64BIT))
19268 return false;
19270 start_sequence ();
19271 compare_op = ix86_expand_compare (code, op0, op1);
19272 compare_seq = get_insns ();
19273 end_sequence ();
19275 compare_code = GET_CODE (compare_op);
19277 if ((op1 == const0_rtx && (code == GE || code == LT))
19278 || (op1 == constm1_rtx && (code == GT || code == LE)))
19279 sign_bit_compare_p = true;
19281 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19282 HImode insns, we'd be swallowed in word prefix ops. */
19284 if ((mode != HImode || TARGET_FAST_PREFIX)
19285 && (mode != (TARGET_64BIT ? TImode : DImode))
19286 && CONST_INT_P (operands[2])
19287 && CONST_INT_P (operands[3]))
19289 rtx out = operands[0];
19290 HOST_WIDE_INT ct = INTVAL (operands[2]);
19291 HOST_WIDE_INT cf = INTVAL (operands[3]);
19292 HOST_WIDE_INT diff;
19294 diff = ct - cf;
19295 /* Sign-bit compares are better done using shifts than by using
19296 sbb. */
19297 if (sign_bit_compare_p
19298 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19300 /* Detect overlap between destination and compare sources. */
19301 rtx tmp = out;
19303 if (!sign_bit_compare_p)
19305 rtx flags;
19306 bool fpcmp = false;
19308 compare_code = GET_CODE (compare_op);
19310 flags = XEXP (compare_op, 0);
19312 if (GET_MODE (flags) == CCFPmode
19313 || GET_MODE (flags) == CCFPUmode)
19315 fpcmp = true;
19316 compare_code
19317 = ix86_fp_compare_code_to_integer (compare_code);
19320 /* To simplify rest of code, restrict to the GEU case. */
19321 if (compare_code == LTU)
19323 HOST_WIDE_INT tmp = ct;
19324 ct = cf;
19325 cf = tmp;
19326 compare_code = reverse_condition (compare_code);
19327 code = reverse_condition (code);
19329 else
19331 if (fpcmp)
19332 PUT_CODE (compare_op,
19333 reverse_condition_maybe_unordered
19334 (GET_CODE (compare_op)));
19335 else
19336 PUT_CODE (compare_op,
19337 reverse_condition (GET_CODE (compare_op)));
19339 diff = ct - cf;
19341 if (reg_overlap_mentioned_p (out, op0)
19342 || reg_overlap_mentioned_p (out, op1))
19343 tmp = gen_reg_rtx (mode);
19345 if (mode == DImode)
19346 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19347 else
19348 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19349 flags, compare_op));
19351 else
19353 if (code == GT || code == GE)
19354 code = reverse_condition (code);
19355 else
19357 HOST_WIDE_INT tmp = ct;
19358 ct = cf;
19359 cf = tmp;
19360 diff = ct - cf;
19362 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19365 if (diff == 1)
19368 * cmpl op0,op1
19369 * sbbl dest,dest
19370 * [addl dest, ct]
19372 * Size 5 - 8.
19374 if (ct)
19375 tmp = expand_simple_binop (mode, PLUS,
19376 tmp, GEN_INT (ct),
19377 copy_rtx (tmp), 1, OPTAB_DIRECT);
19379 else if (cf == -1)
19382 * cmpl op0,op1
19383 * sbbl dest,dest
19384 * orl $ct, dest
19386 * Size 8.
19388 tmp = expand_simple_binop (mode, IOR,
19389 tmp, GEN_INT (ct),
19390 copy_rtx (tmp), 1, OPTAB_DIRECT);
19392 else if (diff == -1 && ct)
19395 * cmpl op0,op1
19396 * sbbl dest,dest
19397 * notl dest
19398 * [addl dest, cf]
19400 * Size 8 - 11.
19402 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19403 if (cf)
19404 tmp = expand_simple_binop (mode, PLUS,
19405 copy_rtx (tmp), GEN_INT (cf),
19406 copy_rtx (tmp), 1, OPTAB_DIRECT);
19408 else
19411 * cmpl op0,op1
19412 * sbbl dest,dest
19413 * [notl dest]
19414 * andl cf - ct, dest
19415 * [addl dest, ct]
19417 * Size 8 - 11.
19420 if (cf == 0)
19422 cf = ct;
19423 ct = 0;
19424 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19427 tmp = expand_simple_binop (mode, AND,
19428 copy_rtx (tmp),
19429 gen_int_mode (cf - ct, mode),
19430 copy_rtx (tmp), 1, OPTAB_DIRECT);
19431 if (ct)
19432 tmp = expand_simple_binop (mode, PLUS,
19433 copy_rtx (tmp), GEN_INT (ct),
19434 copy_rtx (tmp), 1, OPTAB_DIRECT);
19437 if (!rtx_equal_p (tmp, out))
19438 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19440 return true;
19443 if (diff < 0)
19445 enum machine_mode cmp_mode = GET_MODE (op0);
19447 HOST_WIDE_INT tmp;
19448 tmp = ct, ct = cf, cf = tmp;
19449 diff = -diff;
19451 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19453 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19455 /* We may be reversing an unordered compare to a normal compare, which
19456 is not valid in general (we may convert a non-trapping condition
19457 to a trapping one); however, on i386 we currently emit all
19458 comparisons unordered. */
19459 compare_code = reverse_condition_maybe_unordered (compare_code);
19460 code = reverse_condition_maybe_unordered (code);
19462 else
19464 compare_code = reverse_condition (compare_code);
19465 code = reverse_condition (code);
19469 compare_code = UNKNOWN;
19470 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19471 && CONST_INT_P (op1))
19473 if (op1 == const0_rtx
19474 && (code == LT || code == GE))
19475 compare_code = code;
19476 else if (op1 == constm1_rtx)
19478 if (code == LE)
19479 compare_code = LT;
19480 else if (code == GT)
19481 compare_code = GE;
19485 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19486 if (compare_code != UNKNOWN
19487 && GET_MODE (op0) == GET_MODE (out)
19488 && (cf == -1 || ct == -1))
19490 /* If lea code below could be used, only optimize
19491 if it results in a 2 insn sequence. */
19493 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19494 || diff == 3 || diff == 5 || diff == 9)
19495 || (compare_code == LT && ct == -1)
19496 || (compare_code == GE && cf == -1))
19499 * notl op1 (if necessary)
19500 * sarl $31, op1
19501 * orl cf, op1
19503 if (ct != -1)
19505 cf = ct;
19506 ct = -1;
19507 code = reverse_condition (code);
19510 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19512 out = expand_simple_binop (mode, IOR,
19513 out, GEN_INT (cf),
19514 out, 1, OPTAB_DIRECT);
19515 if (out != operands[0])
19516 emit_move_insn (operands[0], out);
19518 return true;
19523 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19524 || diff == 3 || diff == 5 || diff == 9)
19525 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19526 && (mode != DImode
19527 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19530 * xorl dest,dest
19531 * cmpl op1,op2
19532 * setcc dest
19533 * lea cf(dest*(ct-cf)),dest
19535 * Size 14.
19537 * This also catches the degenerate setcc-only case.
19540 rtx tmp;
19541 int nops;
19543 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19545 nops = 0;
19546 /* On x86_64 the lea instruction operates on Pmode, so we need
19547 to do the arithmetic in the proper mode to match. */
19548 if (diff == 1)
19549 tmp = copy_rtx (out);
19550 else
19552 rtx out1;
19553 out1 = copy_rtx (out);
19554 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19555 nops++;
19556 if (diff & 1)
19558 tmp = gen_rtx_PLUS (mode, tmp, out1);
19559 nops++;
19562 if (cf != 0)
19564 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19565 nops++;
19567 if (!rtx_equal_p (tmp, out))
19569 if (nops == 1)
19570 out = force_operand (tmp, copy_rtx (out));
19571 else
19572 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19574 if (!rtx_equal_p (out, operands[0]))
19575 emit_move_insn (operands[0], copy_rtx (out));
19577 return true;
19581 * General case: Jumpful:
19582 * xorl dest,dest cmpl op1, op2
19583 * cmpl op1, op2 movl ct, dest
19584 * setcc dest jcc 1f
19585 * decl dest movl cf, dest
19586 * andl (cf-ct),dest 1:
19587 * addl ct,dest
19589 * Size 20. Size 14.
19591 * This is reasonably steep, but branch mispredict costs are
19592 * high on modern cpus, so consider failing only if optimizing
19593 * for space.
19596 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19597 && BRANCH_COST (optimize_insn_for_speed_p (),
19598 false) >= 2)
19600 if (cf == 0)
19602 enum machine_mode cmp_mode = GET_MODE (op0);
19604 cf = ct;
19605 ct = 0;
19607 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19609 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19611 /* We may be reversing an unordered compare to a normal compare,
19612 which is not valid in general (we may convert a non-trapping
19613 condition to a trapping one); however, on i386 we currently
19614 emit all comparisons unordered. */
19615 code = reverse_condition_maybe_unordered (code);
19617 else
19619 code = reverse_condition (code);
19620 if (compare_code != UNKNOWN)
19621 compare_code = reverse_condition (compare_code);
19625 if (compare_code != UNKNOWN)
19627 /* notl op1 (if needed)
19628 sarl $31, op1
19629 andl (cf-ct), op1
19630 addl ct, op1
19632 For x < 0 (resp. x <= -1) there will be no notl,
19633 so if possible swap the constants to get rid of the
19634 complement.
19635 True/false will be -1/0 while code below (store flag
19636 followed by decrement) is 0/-1, so the constants need
19637 to be exchanged once more. */
19639 if (compare_code == GE || !cf)
19641 code = reverse_condition (code);
19642 compare_code = LT;
19644 else
19646 HOST_WIDE_INT tmp = cf;
19647 cf = ct;
19648 ct = tmp;
19651 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19653 else
19655 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19657 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19658 constm1_rtx,
19659 copy_rtx (out), 1, OPTAB_DIRECT);
19662 out = expand_simple_binop (mode, AND, copy_rtx (out),
19663 gen_int_mode (cf - ct, mode),
19664 copy_rtx (out), 1, OPTAB_DIRECT);
19665 if (ct)
19666 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19667 copy_rtx (out), 1, OPTAB_DIRECT);
19668 if (!rtx_equal_p (out, operands[0]))
19669 emit_move_insn (operands[0], copy_rtx (out));
19671 return true;
19675 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19677 /* Try a few things more with specific constants and a variable. */
19679 optab op;
19680 rtx var, orig_out, out, tmp;
19682 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19683 return false;
19685 /* If one of the two operands is an interesting constant (0 or -1), use
19686 the code above to load a constant and mask the variable in with a logical operation. */
19688 if (CONST_INT_P (operands[2]))
19690 var = operands[3];
19691 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19692 operands[3] = constm1_rtx, op = and_optab;
19693 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19694 operands[3] = const0_rtx, op = ior_optab;
19695 else
19696 return false;
19698 else if (CONST_INT_P (operands[3]))
19700 var = operands[2];
19701 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19702 operands[2] = constm1_rtx, op = and_optab;
19703 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19704 operands[2] = const0_rtx, op = ior_optab;
19705 else
19706 return false;
19708 else
19709 return false;
19711 orig_out = operands[0];
19712 tmp = gen_reg_rtx (mode);
19713 operands[0] = tmp;
19715 /* Recurse to get the constant loaded. */
19716 if (ix86_expand_int_movcc (operands) == 0)
19717 return false;
19719 /* Mask in the interesting variable. */
19720 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19721 OPTAB_WIDEN);
19722 if (!rtx_equal_p (out, orig_out))
19723 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19725 return true;
19729 * For comparison with above,
19731 * movl cf,dest
19732 * movl ct,tmp
19733 * cmpl op1,op2
19734 * cmovcc tmp,dest
19736 * Size 15.
19739 if (! nonimmediate_operand (operands[2], mode))
19740 operands[2] = force_reg (mode, operands[2]);
19741 if (! nonimmediate_operand (operands[3], mode))
19742 operands[3] = force_reg (mode, operands[3]);
19744 if (! register_operand (operands[2], VOIDmode)
19745 && (mode == QImode
19746 || ! register_operand (operands[3], VOIDmode)))
19747 operands[2] = force_reg (mode, operands[2]);
19749 if (mode == QImode
19750 && ! register_operand (operands[3], VOIDmode))
19751 operands[3] = force_reg (mode, operands[3]);
19753 emit_insn (compare_seq);
19754 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19755 gen_rtx_IF_THEN_ELSE (mode,
19756 compare_op, operands[2],
19757 operands[3])));
19758 return true;
19761 /* Swap, force into registers, or otherwise massage the two operands
19762 to an sse comparison with a mask result. Thus we differ a bit from
19763 ix86_prepare_fp_compare_args which expects to produce a flags result.
19765 The DEST operand exists to help determine whether to commute commutative
19766 operators. The POP0/POP1 operands are updated in place. The new
19767 comparison code is returned, or UNKNOWN if not implementable. */
19769 static enum rtx_code
19770 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19771 rtx *pop0, rtx *pop1)
19773 rtx tmp;
19775 switch (code)
19777 case LTGT:
19778 case UNEQ:
19779 /* AVX supports all the needed comparisons. */
19780 if (TARGET_AVX)
19781 break;
19782 /* We have no LTGT as an operator. We could implement it with
19783 NE & ORDERED, but this requires an extra temporary. It's
19784 not clear that it's worth it. */
19785 return UNKNOWN;
19787 case LT:
19788 case LE:
19789 case UNGT:
19790 case UNGE:
19791 /* These are supported directly. */
19792 break;
19794 case EQ:
19795 case NE:
19796 case UNORDERED:
19797 case ORDERED:
19798 /* AVX has 3 operand comparisons, no need to swap anything. */
19799 if (TARGET_AVX)
19800 break;
19801 /* For commutative operators, try to canonicalize the destination
19802 operand to be first in the comparison - this helps reload to
19803 avoid extra moves. */
19804 if (!dest || !rtx_equal_p (dest, *pop1))
19805 break;
19806 /* FALLTHRU */
19808 case GE:
19809 case GT:
19810 case UNLE:
19811 case UNLT:
19812 /* These are not supported directly before AVX, and furthermore
19813 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19814 comparison operands to transform into something that is
19815 supported. */
19816 tmp = *pop0;
19817 *pop0 = *pop1;
19818 *pop1 = tmp;
19819 code = swap_condition (code);
19820 break;
19822 default:
19823 gcc_unreachable ();
19826 return code;
19829 /* Detect conditional moves that exactly match min/max operational
19830 semantics. Note that this is IEEE safe, as long as we don't
19831 interchange the operands.
19833 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19834 and TRUE if the operation is successful and instructions are emitted. */
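/* For example, (a < b ? a : b) is recognized as a minimum and
   (a < b ? b : a) as a maximum; the UNGE forms are handled by first
   swapping the true and false arms.  */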
19836 static bool
19837 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19838 rtx cmp_op1, rtx if_true, rtx if_false)
19840 enum machine_mode mode;
19841 bool is_min;
19842 rtx tmp;
19844 if (code == LT)
19846 else if (code == UNGE)
19848 tmp = if_true;
19849 if_true = if_false;
19850 if_false = tmp;
19852 else
19853 return false;
19855 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19856 is_min = true;
19857 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19858 is_min = false;
19859 else
19860 return false;
19862 mode = GET_MODE (dest);
19864 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19865 but MODE may be a vector mode and thus not appropriate. */
19866 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19868 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19869 rtvec v;
19871 if_true = force_reg (mode, if_true);
19872 v = gen_rtvec (2, if_true, if_false);
19873 tmp = gen_rtx_UNSPEC (mode, v, u);
19875 else
19877 code = is_min ? SMIN : SMAX;
19878 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19881 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19882 return true;
19885 /* Expand an sse vector comparison. Return the register with the result. */
19887 static rtx
19888 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19889 rtx op_true, rtx op_false)
19891 enum machine_mode mode = GET_MODE (dest);
19892 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19893 rtx x;
19895 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19896 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19897 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19899 if (optimize
19900 || reg_overlap_mentioned_p (dest, op_true)
19901 || reg_overlap_mentioned_p (dest, op_false))
19902 dest = gen_reg_rtx (mode);
19904 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19905 if (cmp_mode != mode)
19907 x = force_reg (cmp_mode, x);
19908 convert_move (dest, x, false);
19910 else
19911 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19913 return dest;
19916 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19917 operations. This is used for both scalar and vector conditional moves. */
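/* Without a blend instruction the generic fallback used below is
     dest = (cmp & op_true) | (~cmp & op_false),
   which relies on each element of CMP being all ones or all zeros.  */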
19919 static void
19920 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19922 enum machine_mode mode = GET_MODE (dest);
19923 rtx t2, t3, x;
19925 if (vector_all_ones_operand (op_true, mode)
19926 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19928 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19930 else if (op_false == CONST0_RTX (mode))
19932 op_true = force_reg (mode, op_true);
19933 x = gen_rtx_AND (mode, cmp, op_true);
19934 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19936 else if (op_true == CONST0_RTX (mode))
19938 op_false = force_reg (mode, op_false);
19939 x = gen_rtx_NOT (mode, cmp);
19940 x = gen_rtx_AND (mode, x, op_false);
19941 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19943 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19945 op_false = force_reg (mode, op_false);
19946 x = gen_rtx_IOR (mode, cmp, op_false);
19947 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19949 else if (TARGET_XOP)
19951 op_true = force_reg (mode, op_true);
19953 if (!nonimmediate_operand (op_false, mode))
19954 op_false = force_reg (mode, op_false);
19956 emit_insn (gen_rtx_SET (mode, dest,
19957 gen_rtx_IF_THEN_ELSE (mode, cmp,
19958 op_true,
19959 op_false)));
19961 else
19963 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19965 if (!nonimmediate_operand (op_true, mode))
19966 op_true = force_reg (mode, op_true);
19968 op_false = force_reg (mode, op_false);
19970 switch (mode)
19972 case V4SFmode:
19973 if (TARGET_SSE4_1)
19974 gen = gen_sse4_1_blendvps;
19975 break;
19976 case V2DFmode:
19977 if (TARGET_SSE4_1)
19978 gen = gen_sse4_1_blendvpd;
19979 break;
19980 case V16QImode:
19981 case V8HImode:
19982 case V4SImode:
19983 case V2DImode:
19984 if (TARGET_SSE4_1)
19986 gen = gen_sse4_1_pblendvb;
19987 dest = gen_lowpart (V16QImode, dest);
19988 op_false = gen_lowpart (V16QImode, op_false);
19989 op_true = gen_lowpart (V16QImode, op_true);
19990 cmp = gen_lowpart (V16QImode, cmp);
19992 break;
19993 case V8SFmode:
19994 if (TARGET_AVX)
19995 gen = gen_avx_blendvps256;
19996 break;
19997 case V4DFmode:
19998 if (TARGET_AVX)
19999 gen = gen_avx_blendvpd256;
20000 break;
20001 case V32QImode:
20002 case V16HImode:
20003 case V8SImode:
20004 case V4DImode:
20005 if (TARGET_AVX2)
20007 gen = gen_avx2_pblendvb;
20008 dest = gen_lowpart (V32QImode, dest);
20009 op_false = gen_lowpart (V32QImode, op_false);
20010 op_true = gen_lowpart (V32QImode, op_true);
20011 cmp = gen_lowpart (V32QImode, cmp);
20013 break;
20014 default:
20015 break;
20018 if (gen != NULL)
20019 emit_insn (gen (dest, op_false, op_true, cmp));
20020 else
20022 op_true = force_reg (mode, op_true);
20024 t2 = gen_reg_rtx (mode);
20025 if (optimize)
20026 t3 = gen_reg_rtx (mode);
20027 else
20028 t3 = dest;
20030 x = gen_rtx_AND (mode, op_true, cmp);
20031 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20033 x = gen_rtx_NOT (mode, cmp);
20034 x = gen_rtx_AND (mode, x, op_false);
20035 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20037 x = gen_rtx_IOR (mode, t3, t2);
20038 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20043 /* Expand a floating-point conditional move. Return true if successful. */
20045 bool
20046 ix86_expand_fp_movcc (rtx operands[])
20048 enum machine_mode mode = GET_MODE (operands[0]);
20049 enum rtx_code code = GET_CODE (operands[1]);
20050 rtx tmp, compare_op;
20051 rtx op0 = XEXP (operands[1], 0);
20052 rtx op1 = XEXP (operands[1], 1);
20054 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20056 enum machine_mode cmode;
20058 /* Since we've no cmove for sse registers, don't force bad register
20059 allocation just to gain access to it. Deny movcc when the
20060 comparison mode doesn't match the move mode. */
20061 cmode = GET_MODE (op0);
20062 if (cmode == VOIDmode)
20063 cmode = GET_MODE (op1);
20064 if (cmode != mode)
20065 return false;
20067 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20068 if (code == UNKNOWN)
20069 return false;
20071 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20072 operands[2], operands[3]))
20073 return true;
20075 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20076 operands[2], operands[3]);
20077 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20078 return true;
20081 if (GET_MODE (op0) == TImode
20082 || (GET_MODE (op0) == DImode
20083 && !TARGET_64BIT))
20084 return false;
20086 /* The floating point conditional move instructions don't directly
20087 support conditions resulting from a signed integer comparison. */
20089 compare_op = ix86_expand_compare (code, op0, op1);
20090 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20092 tmp = gen_reg_rtx (QImode);
20093 ix86_expand_setcc (tmp, code, op0, op1);
20095 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20098 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20099 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20100 operands[2], operands[3])));
20102 return true;
20105 /* Expand a floating-point vector conditional move; a vcond operation
20106 rather than a movcc operation. */
20108 bool
20109 ix86_expand_fp_vcond (rtx operands[])
20111 enum rtx_code code = GET_CODE (operands[3]);
20112 rtx cmp;
20114 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20115 &operands[4], &operands[5]);
20116 if (code == UNKNOWN)
20118 rtx temp;
20119 switch (GET_CODE (operands[3]))
20121 case LTGT:
20122 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20123 operands[5], operands[0], operands[0]);
20124 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20125 operands[5], operands[1], operands[2]);
20126 code = AND;
20127 break;
20128 case UNEQ:
20129 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20130 operands[5], operands[0], operands[0]);
20131 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20132 operands[5], operands[1], operands[2]);
20133 code = IOR;
20134 break;
20135 default:
20136 gcc_unreachable ();
20138 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20139 OPTAB_DIRECT);
20140 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20141 return true;
20144 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20145 operands[5], operands[1], operands[2]))
20146 return true;
20148 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20149 operands[1], operands[2]);
20150 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20151 return true;
20154 /* Expand a signed/unsigned integral vector conditional move. */
20156 bool
20157 ix86_expand_int_vcond (rtx operands[])
20159 enum machine_mode data_mode = GET_MODE (operands[0]);
20160 enum machine_mode mode = GET_MODE (operands[4]);
20161 enum rtx_code code = GET_CODE (operands[3]);
20162 bool negate = false;
20163 rtx x, cop0, cop1;
20165 cop0 = operands[4];
20166 cop1 = operands[5];
20168 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20169 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
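  /* E.g. for V4SImode, x < 0 ? -1 : 0 is an arithmetic shift right by 31
     of each element and x < 0 ? 1 : 0 a logical shift right by 31; the
     GE forms are the same cases with the arms swapped.  */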
20170 if ((code == LT || code == GE)
20171 && data_mode == mode
20172 && cop1 == CONST0_RTX (mode)
20173 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20174 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20175 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20176 && (GET_MODE_SIZE (data_mode) == 16
20177 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20179 rtx negop = operands[2 - (code == LT)];
20180 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20181 if (negop == CONST1_RTX (data_mode))
20183 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20184 operands[0], 1, OPTAB_DIRECT);
20185 if (res != operands[0])
20186 emit_move_insn (operands[0], res);
20187 return true;
20189 else if (GET_MODE_INNER (data_mode) != DImode
20190 && vector_all_ones_operand (negop, data_mode))
20192 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20193 operands[0], 0, OPTAB_DIRECT);
20194 if (res != operands[0])
20195 emit_move_insn (operands[0], res);
20196 return true;
20200 if (!nonimmediate_operand (cop1, mode))
20201 cop1 = force_reg (mode, cop1);
20202 if (!general_operand (operands[1], data_mode))
20203 operands[1] = force_reg (data_mode, operands[1]);
20204 if (!general_operand (operands[2], data_mode))
20205 operands[2] = force_reg (data_mode, operands[2]);
20207 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20208 if (TARGET_XOP
20209 && (mode == V16QImode || mode == V8HImode
20210 || mode == V4SImode || mode == V2DImode))
20212 else
20214 /* Canonicalize the comparison to EQ, GT, GTU. */
20215 switch (code)
20217 case EQ:
20218 case GT:
20219 case GTU:
20220 break;
20222 case NE:
20223 case LE:
20224 case LEU:
20225 code = reverse_condition (code);
20226 negate = true;
20227 break;
20229 case GE:
20230 case GEU:
20231 code = reverse_condition (code);
20232 negate = true;
20233 /* FALLTHRU */
20235 case LT:
20236 case LTU:
20237 code = swap_condition (code);
20238 x = cop0, cop0 = cop1, cop1 = x;
20239 break;
20241 default:
20242 gcc_unreachable ();
20245 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20246 if (mode == V2DImode)
20248 switch (code)
20250 case EQ:
20251 /* SSE4.1 supports EQ. */
20252 if (!TARGET_SSE4_1)
20253 return false;
20254 break;
20256 case GT:
20257 case GTU:
20258 /* SSE4.2 supports GT/GTU. */
20259 if (!TARGET_SSE4_2)
20260 return false;
20261 break;
20263 default:
20264 gcc_unreachable ();
20268 /* Unsigned parallel compare is not supported by the hardware.
20269 Play some tricks to turn this into a signed comparison
20270 that the hardware does support. */
20271 if (code == GTU)
20273 cop0 = force_reg (mode, cop0);
20275 switch (mode)
20277 case V8SImode:
20278 case V4DImode:
20279 case V4SImode:
20280 case V2DImode:
20282 rtx t1, t2, mask;
20283 rtx (*gen_sub3) (rtx, rtx, rtx);
20285 switch (mode)
20287 case V8SImode: gen_sub3 = gen_subv8si3; break;
20288 case V4DImode: gen_sub3 = gen_subv4di3; break;
20289 case V4SImode: gen_sub3 = gen_subv4si3; break;
20290 case V2DImode: gen_sub3 = gen_subv2di3; break;
20291 default:
20292 gcc_unreachable ();
20294 /* Subtract (-(INT MAX) - 1) from both operands to make
20295 them signed. */
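	      /* E.g. for V4SImode, subtracting 0x80000000 from each element
		 flips its sign bit, so the unsigned a > b becomes the signed
		 (a - 0x80000000) > (b - 0x80000000) handled by the GT compare
		 below.  */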
20296 mask = ix86_build_signbit_mask (mode, true, false);
20297 t1 = gen_reg_rtx (mode);
20298 emit_insn (gen_sub3 (t1, cop0, mask));
20300 t2 = gen_reg_rtx (mode);
20301 emit_insn (gen_sub3 (t2, cop1, mask));
20303 cop0 = t1;
20304 cop1 = t2;
20305 code = GT;
20307 break;
20309 case V32QImode:
20310 case V16HImode:
20311 case V16QImode:
20312 case V8HImode:
20313 /* Perform a parallel unsigned saturating subtraction. */
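	    /* E.g. for unsigned bytes, (a - b) with saturation is nonzero
	       exactly when a > b, so the GTU result is obtained by comparing
	       the saturating difference against zero and negating.  */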
20314 x = gen_reg_rtx (mode);
20315 emit_insn (gen_rtx_SET (VOIDmode, x,
20316 gen_rtx_US_MINUS (mode, cop0, cop1)));
20318 cop0 = x;
20319 cop1 = CONST0_RTX (mode);
20320 code = EQ;
20321 negate = !negate;
20322 break;
20324 default:
20325 gcc_unreachable ();
20330 /* Allow the comparison to be done in one mode, but the movcc to
20331 happen in another mode. */
20332 if (data_mode == mode)
20334 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20335 operands[1+negate], operands[2-negate]);
20337 else
20339 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20340 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20341 code, cop0, cop1,
20342 operands[1+negate], operands[2-negate]);
20343 x = gen_lowpart (data_mode, x);
20346 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20347 operands[2-negate]);
20348 return true;
20351 /* Expand a variable vector permutation. */
20353 void
20354 ix86_expand_vec_perm (rtx operands[])
20356 rtx target = operands[0];
20357 rtx op0 = operands[1];
20358 rtx op1 = operands[2];
20359 rtx mask = operands[3];
20360 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20361 enum machine_mode mode = GET_MODE (op0);
20362 enum machine_mode maskmode = GET_MODE (mask);
20363 int w, e, i;
20364 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20366 /* Number of elements in the vector. */
20367 w = GET_MODE_NUNITS (mode);
20368 e = GET_MODE_UNIT_SIZE (mode);
20369 gcc_assert (w <= 32);
20371 if (TARGET_AVX2)
20373 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20375 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20376 a constant shuffle operand. With a tiny bit of effort we can
20377 use VPERMD instead. A re-interpretation stall for V4DFmode is
20378 unfortunate but there's no avoiding it.
20379 Similarly for V16HImode we don't have instructions for variable
20380 shuffling, while for V32QImode we can use vpshufb; vpshufb;
20381 vpermq; vpor after preparing suitable masks. */
20383 if (mode == V16HImode)
20385 maskmode = mode = V32QImode;
20386 w = 32;
20387 e = 1;
20389 else
20391 maskmode = mode = V8SImode;
20392 w = 8;
20393 e = 4;
20395 t1 = gen_reg_rtx (maskmode);
20397 /* Replicate the low bits of the V4DImode mask into V8SImode:
20398 mask = { A B C D }
20399 t1 = { A A B B C C D D }. */
20400 for (i = 0; i < w / 2; ++i)
20401 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20402 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20403 vt = force_reg (maskmode, vt);
20404 mask = gen_lowpart (maskmode, mask);
20405 if (maskmode == V8SImode)
20406 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20407 else
20408 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20410 /* Multiply the shuffle indices by two. */
20411 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20412 OPTAB_DIRECT);
20414 /* Add one to the odd shuffle indices:
20415 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
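	  /* Worked example: a V4DImode mask of { 3 0 2 1 } becomes the
	     V8SImode control { 6 7 0 1 4 5 2 3 }, which selects the same
	     64-bit elements as pairs of 32-bit elements.  */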
20416 for (i = 0; i < w / 2; ++i)
20418 vec[i * 2] = const0_rtx;
20419 vec[i * 2 + 1] = const1_rtx;
20421 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20422 vt = force_const_mem (maskmode, vt);
20423 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20424 OPTAB_DIRECT);
20426 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20427 operands[3] = mask = t1;
20428 target = gen_lowpart (mode, target);
20429 op0 = gen_lowpart (mode, op0);
20430 op1 = gen_lowpart (mode, op1);
20433 switch (mode)
20435 case V8SImode:
20436 /* The VPERMD and VPERMPS instructions already properly ignore
20437 the high bits of the shuffle elements. No need for us to
20438 perform an AND ourselves. */
20439 if (one_operand_shuffle)
20440 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20441 else
20443 t1 = gen_reg_rtx (V8SImode);
20444 t2 = gen_reg_rtx (V8SImode);
20445 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20446 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20447 goto merge_two;
20449 return;
20451 case V8SFmode:
20452 mask = gen_lowpart (V8SFmode, mask);
20453 if (one_operand_shuffle)
20454 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20455 else
20457 t1 = gen_reg_rtx (V8SFmode);
20458 t2 = gen_reg_rtx (V8SFmode);
20459 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20460 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20461 goto merge_two;
20463 return;
20465 case V4SImode:
20466 /* By combining the two 128-bit input vectors into one 256-bit
20467 input vector, we can use VPERMD and VPERMPS for the full
20468 two-operand shuffle. */
20469 t1 = gen_reg_rtx (V8SImode);
20470 t2 = gen_reg_rtx (V8SImode);
20471 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20472 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20473 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20474 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20475 return;
20477 case V4SFmode:
20478 t1 = gen_reg_rtx (V8SFmode);
20479 t2 = gen_reg_rtx (V8SImode);
20480 mask = gen_lowpart (V4SImode, mask);
20481 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20482 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20483 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20484 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20485 return;
20487 case V32QImode:
20488 t1 = gen_reg_rtx (V32QImode);
20489 t2 = gen_reg_rtx (V32QImode);
20490 t3 = gen_reg_rtx (V32QImode);
20491 vt2 = GEN_INT (128);
20492 for (i = 0; i < 32; i++)
20493 vec[i] = vt2;
20494 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20495 vt = force_reg (V32QImode, vt);
20496 for (i = 0; i < 32; i++)
20497 vec[i] = i < 16 ? vt2 : const0_rtx;
20498 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20499 vt2 = force_reg (V32QImode, vt2);
20500 /* From mask create two adjusted masks, which contain the same
20501 bits as mask in the low 7 bits of each vector element.
20502 The first mask will have the most significant bit clear
20503 if it requests element from the same 128-bit lane
20504 and MSB set if it requests element from the other 128-bit lane.
20505 The second mask will have the opposite values of the MSB,
20506 and additionally will have its 128-bit lanes swapped.
20507 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20508 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20509 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20510 stands for the other 12 bytes. */
20511 /* The bit that says whether an element comes from the same lane or the
20512 other lane is bit 4, so shift it up by 3 to the MSB position. */
20513 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20514 gen_lowpart (V4DImode, mask),
20515 GEN_INT (3)));
20516 /* Clear MSB bits from the mask just in case it had them set. */
20517 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20518 /* After this t1 will have MSB set for elements from other lane. */
20519 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20520 /* Clear bits other than MSB. */
20521 emit_insn (gen_andv32qi3 (t1, t1, vt));
20522 /* Or in the lower bits from mask into t3. */
20523 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20524 /* And invert MSB bits in t1, so MSB is set for elements from the same
20525 lane. */
20526 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20527 /* Swap 128-bit lanes in t3. */
20528 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20529 gen_lowpart (V4DImode, t3),
20530 const2_rtx, GEN_INT (3),
20531 const0_rtx, const1_rtx));
20532 /* And or in the lower bits from mask into t1. */
20533 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20534 if (one_operand_shuffle)
20536 /* Each of these shuffles will put 0s in places where
20537 element from the other 128-bit lane is needed, otherwise
20538 will shuffle in the requested value. */
20539 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20540 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20541 /* For t3 the 128-bit lanes are swapped again. */
20542 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20543 gen_lowpart (V4DImode, t3),
20544 const2_rtx, GEN_INT (3),
20545 const0_rtx, const1_rtx));
20546 /* And ORing both together yields the result. */
20547 emit_insn (gen_iorv32qi3 (target, t1, t3));
20548 return;
20551 t4 = gen_reg_rtx (V32QImode);
20552 /* Similar to the one_operand_shuffle code above, just
20553 repeated twice, once for each operand. The merge_two:
20554 code will merge the two results together. */
20555 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20556 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20557 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20558 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20559 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20560 gen_lowpart (V4DImode, t4),
20561 const2_rtx, GEN_INT (3),
20562 const0_rtx, const1_rtx));
20563 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20564 gen_lowpart (V4DImode, t3),
20565 const2_rtx, GEN_INT (3),
20566 const0_rtx, const1_rtx));
20567 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20568 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20569 t1 = t4;
20570 t2 = t3;
20571 goto merge_two;
20573 default:
20574 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20575 break;
20579 if (TARGET_XOP)
20581 /* The XOP VPPERM insn supports three inputs. By ignoring the
20582 one_operand_shuffle special case, we avoid creating another
20583 set of constant vectors in memory. */
20584 one_operand_shuffle = false;
20586 /* mask = mask & {2*w-1, ...} */
20587 vt = GEN_INT (2*w - 1);
20589 else
20591 /* mask = mask & {w-1, ...} */
20592 vt = GEN_INT (w - 1);
20595 for (i = 0; i < w; i++)
20596 vec[i] = vt;
20597 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20598 mask = expand_simple_binop (maskmode, AND, mask, vt,
20599 NULL_RTX, 0, OPTAB_DIRECT);
20601 /* For non-QImode operations, convert the word permutation control
20602 into a byte permutation control. */
20603 if (mode != V16QImode)
20605 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20606 GEN_INT (exact_log2 (e)),
20607 NULL_RTX, 0, OPTAB_DIRECT);
20609 /* Convert mask to vector of chars. */
20610 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20612 /* Replicate each of the input bytes into byte positions:
20613 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20614 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20615 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20616 for (i = 0; i < 16; ++i)
20617 vec[i] = GEN_INT (i/e * e);
20618 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20619 vt = force_const_mem (V16QImode, vt);
20620 if (TARGET_XOP)
20621 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20622 else
20623 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20625 /* Convert it into the byte positions by doing
20626 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
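      /* Worked example: for V4SImode a word index of 2 first becomes 8
	 after the shift, is replicated to { 8 8 8 8 } within its element,
	 and ends up as the byte control { 8 9 10 11 } after the addition.  */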
20627 for (i = 0; i < 16; ++i)
20628 vec[i] = GEN_INT (i % e);
20629 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20630 vt = force_const_mem (V16QImode, vt);
20631 emit_insn (gen_addv16qi3 (mask, mask, vt));
20634 /* The actual shuffle operations all operate on V16QImode. */
20635 op0 = gen_lowpart (V16QImode, op0);
20636 op1 = gen_lowpart (V16QImode, op1);
20637 target = gen_lowpart (V16QImode, target);
20639 if (TARGET_XOP)
20641 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20643 else if (one_operand_shuffle)
20645 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20647 else
20649 rtx xops[6];
20650 bool ok;
20652 /* Shuffle the two input vectors independently. */
20653 t1 = gen_reg_rtx (V16QImode);
20654 t2 = gen_reg_rtx (V16QImode);
20655 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20656 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20658 merge_two:
20659 /* Then merge them together. The key is whether any given control
20660 element contained a bit set that indicates the second word. */
20661 mask = operands[3];
20662 vt = GEN_INT (w);
20663 if (maskmode == V2DImode && !TARGET_SSE4_1)
20665 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20666 more shuffle to convert the V2DI input mask into a V4SI
20667 input mask, at which point the masking done by expand_int_vcond
20668 will work as desired. */
20669 rtx t3 = gen_reg_rtx (V4SImode);
20670 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20671 const0_rtx, const0_rtx,
20672 const2_rtx, const2_rtx));
20673 mask = t3;
20674 maskmode = V4SImode;
20675 e = w = 4;
20678 for (i = 0; i < w; i++)
20679 vec[i] = vt;
20680 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20681 vt = force_reg (maskmode, vt);
20682 mask = expand_simple_binop (maskmode, AND, mask, vt,
20683 NULL_RTX, 0, OPTAB_DIRECT);
20685 xops[0] = gen_lowpart (mode, operands[0]);
20686 xops[1] = gen_lowpart (mode, t2);
20687 xops[2] = gen_lowpart (mode, t1);
20688 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20689 xops[4] = mask;
20690 xops[5] = vt;
20691 ok = ix86_expand_int_vcond (xops);
20692 gcc_assert (ok);
20696 /* Unpack SRC into the next wider integer vector type in DEST. UNSIGNED_P
20697 is true if we should do zero extension, else sign extension. HIGH_P is
20698 true if we want the N/2 high elements, else the low elements. */
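/* For example, unpacking a V8HImode SRC with HIGH_P false produces the
   low four elements widened to V4SImode in DEST, either zero- or
   sign-extended according to UNSIGNED_P.  */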
20700 void
20701 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20703 enum machine_mode imode = GET_MODE (src);
20704 rtx tmp;
20706 if (TARGET_SSE4_1)
20708 rtx (*unpack)(rtx, rtx);
20709 rtx (*extract)(rtx, rtx) = NULL;
20710 enum machine_mode halfmode = BLKmode;
20712 switch (imode)
20714 case V32QImode:
20715 if (unsigned_p)
20716 unpack = gen_avx2_zero_extendv16qiv16hi2;
20717 else
20718 unpack = gen_avx2_sign_extendv16qiv16hi2;
20719 halfmode = V16QImode;
20720 extract
20721 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20722 break;
20723 case V16HImode:
20724 if (unsigned_p)
20725 unpack = gen_avx2_zero_extendv8hiv8si2;
20726 else
20727 unpack = gen_avx2_sign_extendv8hiv8si2;
20728 halfmode = V8HImode;
20729 extract
20730 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20731 break;
20732 case V8SImode:
20733 if (unsigned_p)
20734 unpack = gen_avx2_zero_extendv4siv4di2;
20735 else
20736 unpack = gen_avx2_sign_extendv4siv4di2;
20737 halfmode = V4SImode;
20738 extract
20739 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20740 break;
20741 case V16QImode:
20742 if (unsigned_p)
20743 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20744 else
20745 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20746 break;
20747 case V8HImode:
20748 if (unsigned_p)
20749 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20750 else
20751 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20752 break;
20753 case V4SImode:
20754 if (unsigned_p)
20755 unpack = gen_sse4_1_zero_extendv2siv2di2;
20756 else
20757 unpack = gen_sse4_1_sign_extendv2siv2di2;
20758 break;
20759 default:
20760 gcc_unreachable ();
20763 if (GET_MODE_SIZE (imode) == 32)
20765 tmp = gen_reg_rtx (halfmode);
20766 emit_insn (extract (tmp, src));
20768 else if (high_p)
20770 /* Shift higher 8 bytes to lower 8 bytes. */
20771 tmp = gen_reg_rtx (imode);
20772 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20773 gen_lowpart (V1TImode, src),
20774 GEN_INT (64)));
20776 else
20777 tmp = src;
20779 emit_insn (unpack (dest, tmp));
20781 else
20783 rtx (*unpack)(rtx, rtx, rtx);
20785 switch (imode)
20787 case V16QImode:
20788 if (high_p)
20789 unpack = gen_vec_interleave_highv16qi;
20790 else
20791 unpack = gen_vec_interleave_lowv16qi;
20792 break;
20793 case V8HImode:
20794 if (high_p)
20795 unpack = gen_vec_interleave_highv8hi;
20796 else
20797 unpack = gen_vec_interleave_lowv8hi;
20798 break;
20799 case V4SImode:
20800 if (high_p)
20801 unpack = gen_vec_interleave_highv4si;
20802 else
20803 unpack = gen_vec_interleave_lowv4si;
20804 break;
20805 default:
20806 gcc_unreachable ();
20809 if (unsigned_p)
20810 tmp = force_reg (imode, CONST0_RTX (imode));
20811 else
20812 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20813 src, pc_rtx, pc_rtx);
20815 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20819 /* Expand conditional increment or decrement using adc/sbb instructions.
20820 The default case using setcc followed by the conditional move can be
20821 done by generic code. */
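/* For instance, once the compare has put an unsigned a < b into the carry
   flag, x += (a < b) is a single adc with a zero immediate and
   x -= (a < b) a single sbb with a zero immediate.  */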
20822 bool
20823 ix86_expand_int_addcc (rtx operands[])
20825 enum rtx_code code = GET_CODE (operands[1]);
20826 rtx flags;
20827 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20828 rtx compare_op;
20829 rtx val = const0_rtx;
20830 bool fpcmp = false;
20831 enum machine_mode mode;
20832 rtx op0 = XEXP (operands[1], 0);
20833 rtx op1 = XEXP (operands[1], 1);
20835 if (operands[3] != const1_rtx
20836 && operands[3] != constm1_rtx)
20837 return false;
20838 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20839 return false;
20840 code = GET_CODE (compare_op);
20842 flags = XEXP (compare_op, 0);
20844 if (GET_MODE (flags) == CCFPmode
20845 || GET_MODE (flags) == CCFPUmode)
20847 fpcmp = true;
20848 code = ix86_fp_compare_code_to_integer (code);
20851 if (code != LTU)
20853 val = constm1_rtx;
20854 if (fpcmp)
20855 PUT_CODE (compare_op,
20856 reverse_condition_maybe_unordered
20857 (GET_CODE (compare_op)));
20858 else
20859 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20862 mode = GET_MODE (operands[0]);
20864 /* Construct either adc or sbb insn. */
20865 if ((code == LTU) == (operands[3] == constm1_rtx))
20867 switch (mode)
20869 case QImode:
20870 insn = gen_subqi3_carry;
20871 break;
20872 case HImode:
20873 insn = gen_subhi3_carry;
20874 break;
20875 case SImode:
20876 insn = gen_subsi3_carry;
20877 break;
20878 case DImode:
20879 insn = gen_subdi3_carry;
20880 break;
20881 default:
20882 gcc_unreachable ();
20885 else
20887 switch (mode)
20889 case QImode:
20890 insn = gen_addqi3_carry;
20891 break;
20892 case HImode:
20893 insn = gen_addhi3_carry;
20894 break;
20895 case SImode:
20896 insn = gen_addsi3_carry;
20897 break;
20898 case DImode:
20899 insn = gen_adddi3_carry;
20900 break;
20901 default:
20902 gcc_unreachable ();
20905 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20907 return true;
20911 /* Split OPERAND into half-mode parts stored in PARTS. Similar to
20912 split_double_mode, but works for floating-point parameters and
20913 non-offsettable memories. For pushes, it returns just stack offsets;
20914 the values will be saved in the right order. Maximally four parts are generated. */
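/* For example, on a 32-bit target an XFmode value splits into three
   SImode parts and a TFmode value into four, while on a 64-bit target
   XFmode splits into a DImode part plus an SImode part.  */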
20916 static int
20917 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20919 int size;
20921 if (!TARGET_64BIT)
20922 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20923 else
20924 size = (GET_MODE_SIZE (mode) + 4) / 8;
20926 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20927 gcc_assert (size >= 2 && size <= 4);
20929 /* Optimize constant pool references to immediates. This is used by fp
20930 moves, which force all constants to memory to allow combining. */
20931 if (MEM_P (operand) && MEM_READONLY_P (operand))
20933 rtx tmp = maybe_get_pool_constant (operand);
20934 if (tmp)
20935 operand = tmp;
20938 if (MEM_P (operand) && !offsettable_memref_p (operand))
20940 /* The only non-offsettable memories we handle are pushes. */
20941 int ok = push_operand (operand, VOIDmode);
20943 gcc_assert (ok);
20945 operand = copy_rtx (operand);
20946 PUT_MODE (operand, word_mode);
20947 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20948 return size;
20951 if (GET_CODE (operand) == CONST_VECTOR)
20953 enum machine_mode imode = int_mode_for_mode (mode);
20954 /* Caution: if we looked through a constant pool memory above,
20955 the operand may actually have a different mode now. That's
20956 ok, since we want to pun this all the way back to an integer. */
20957 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20958 gcc_assert (operand != NULL);
20959 mode = imode;
20962 if (!TARGET_64BIT)
20964 if (mode == DImode)
20965 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20966 else
20968 int i;
20970 if (REG_P (operand))
20972 gcc_assert (reload_completed);
20973 for (i = 0; i < size; i++)
20974 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20976 else if (offsettable_memref_p (operand))
20978 operand = adjust_address (operand, SImode, 0);
20979 parts[0] = operand;
20980 for (i = 1; i < size; i++)
20981 parts[i] = adjust_address (operand, SImode, 4 * i);
20983 else if (GET_CODE (operand) == CONST_DOUBLE)
20985 REAL_VALUE_TYPE r;
20986 long l[4];
20988 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20989 switch (mode)
20991 case TFmode:
20992 real_to_target (l, &r, mode);
20993 parts[3] = gen_int_mode (l[3], SImode);
20994 parts[2] = gen_int_mode (l[2], SImode);
20995 break;
20996 case XFmode:
20997 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
20998 long double may not be 80-bit. */
20999 real_to_target (l, &r, mode);
21000 parts[2] = gen_int_mode (l[2], SImode);
21001 break;
21002 case DFmode:
21003 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21004 break;
21005 default:
21006 gcc_unreachable ();
21008 parts[1] = gen_int_mode (l[1], SImode);
21009 parts[0] = gen_int_mode (l[0], SImode);
21011 else
21012 gcc_unreachable ();
21015 else
21017 if (mode == TImode)
21018 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21019 if (mode == XFmode || mode == TFmode)
21021 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21022 if (REG_P (operand))
21024 gcc_assert (reload_completed);
21025 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21026 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21028 else if (offsettable_memref_p (operand))
21030 operand = adjust_address (operand, DImode, 0);
21031 parts[0] = operand;
21032 parts[1] = adjust_address (operand, upper_mode, 8);
21034 else if (GET_CODE (operand) == CONST_DOUBLE)
21036 REAL_VALUE_TYPE r;
21037 long l[4];
21039 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21040 real_to_target (l, &r, mode);
21042 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21043 if (HOST_BITS_PER_WIDE_INT >= 64)
21044 parts[0]
21045 = gen_int_mode
21046 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21047 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21048 DImode);
21049 else
21050 parts[0] = immed_double_const (l[0], l[1], DImode);
21052 if (upper_mode == SImode)
21053 parts[1] = gen_int_mode (l[2], SImode);
21054 else if (HOST_BITS_PER_WIDE_INT >= 64)
21055 parts[1]
21056 = gen_int_mode
21057 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21058 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21059 DImode);
21060 else
21061 parts[1] = immed_double_const (l[2], l[3], DImode);
21063 else
21064 gcc_unreachable ();
21068 return size;
21071 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21072 Operand 0 is the destination and operand 1 the source; all of the
21073 insns required for the move or push are emitted here, in the
21074 correct order. */
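/* For example, on a 32-bit target a DImode register-to-register move is
   split into two SImode moves, emitted in an order that copes with any
   overlap between the source address registers and the destination.  */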
21076 void
21077 ix86_split_long_move (rtx operands[])
21079 rtx part[2][4];
21080 int nparts, i, j;
21081 int push = 0;
21082 int collisions = 0;
21083 enum machine_mode mode = GET_MODE (operands[0]);
21084 bool collisionparts[4];
21086 /* The DFmode expanders may ask us to move a double.
21087 For a 64-bit target this is a single move. By hiding that fact
21088 here we simplify the i386.md splitters. */
21089 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21091 /* Optimize constant pool references into immediates. This is used by
21092 fp moves, which force all constants to memory to allow combining. */
21094 if (MEM_P (operands[1])
21095 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21096 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21097 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21098 if (push_operand (operands[0], VOIDmode))
21100 operands[0] = copy_rtx (operands[0]);
21101 PUT_MODE (operands[0], word_mode);
21103 else
21104 operands[0] = gen_lowpart (DImode, operands[0]);
21105 operands[1] = gen_lowpart (DImode, operands[1]);
21106 emit_move_insn (operands[0], operands[1]);
21107 return;
21110 /* The only non-offsettable memory we handle is push. */
21111 if (push_operand (operands[0], VOIDmode))
21112 push = 1;
21113 else
21114 gcc_assert (!MEM_P (operands[0])
21115 || offsettable_memref_p (operands[0]));
21117 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21118 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21120 /* When emitting a push, take care of source operands that live on the stack. */
21121 if (push && MEM_P (operands[1])
21122 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21124 rtx src_base = XEXP (part[1][nparts - 1], 0);
21126 /* Compensate for the stack decrement by 4. */
21127 if (!TARGET_64BIT && nparts == 3
21128 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21129 src_base = plus_constant (Pmode, src_base, 4);
21131 /* src_base refers to the stack pointer and is
21132 automatically decreased by each emitted push. */
21133 for (i = 0; i < nparts; i++)
21134 part[1][i] = change_address (part[1][i],
21135 GET_MODE (part[1][i]), src_base);
21138 /* We need to do the copy in the right order in case an address register
21139 of the source overlaps the destination. */
21140 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21142 rtx tmp;
21144 for (i = 0; i < nparts; i++)
21146 collisionparts[i]
21147 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21148 if (collisionparts[i])
21149 collisions++;
21152 /* Collision in the middle part can be handled by reordering. */
21153 if (collisions == 1 && nparts == 3 && collisionparts [1])
21155 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21156 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21158 else if (collisions == 1
21159 && nparts == 4
21160 && (collisionparts [1] || collisionparts [2]))
21162 if (collisionparts [1])
21164 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21165 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21167 else
21169 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21170 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21174 /* If there are more collisions, we can't handle them by reordering.
21175 Do an lea to the last part and use only one colliding move. */
21176 else if (collisions > 1)
21178 rtx base;
21180 collisions = 1;
21182 base = part[0][nparts - 1];
21184 /* Handle the case when the last part isn't valid for lea.
21185 Happens in 64-bit mode storing the 12-byte XFmode. */
21186 if (GET_MODE (base) != Pmode)
21187 base = gen_rtx_REG (Pmode, REGNO (base));
21189 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21190 part[1][0] = replace_equiv_address (part[1][0], base);
21191 for (i = 1; i < nparts; i++)
21193 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21194 part[1][i] = replace_equiv_address (part[1][i], tmp);
21199 if (push)
21201 if (!TARGET_64BIT)
21203 if (nparts == 3)
21205 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21206 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21207 stack_pointer_rtx, GEN_INT (-4)));
21208 emit_move_insn (part[0][2], part[1][2]);
21210 else if (nparts == 4)
21212 emit_move_insn (part[0][3], part[1][3]);
21213 emit_move_insn (part[0][2], part[1][2]);
21216 else
21218 /* In 64-bit mode we don't have a 32-bit push available. If the operand is
21219 a register, that is OK - we just use the larger counterpart. We also
21220 retype memory - this comes from an attempt to avoid a REX prefix when
21221 moving the second half of a TFmode value. */
21222 if (GET_MODE (part[1][1]) == SImode)
21224 switch (GET_CODE (part[1][1]))
21226 case MEM:
21227 part[1][1] = adjust_address (part[1][1], DImode, 0);
21228 break;
21230 case REG:
21231 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21232 break;
21234 default:
21235 gcc_unreachable ();
21238 if (GET_MODE (part[1][0]) == SImode)
21239 part[1][0] = part[1][1];
21242 emit_move_insn (part[0][1], part[1][1]);
21243 emit_move_insn (part[0][0], part[1][0]);
21244 return;
21247 /* Choose the correct order so we do not overwrite the source before it is copied. */
21248 if ((REG_P (part[0][0])
21249 && REG_P (part[1][1])
21250 && (REGNO (part[0][0]) == REGNO (part[1][1])
21251 || (nparts == 3
21252 && REGNO (part[0][0]) == REGNO (part[1][2]))
21253 || (nparts == 4
21254 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21255 || (collisions > 0
21256 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21258 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21260 operands[2 + i] = part[0][j];
21261 operands[6 + i] = part[1][j];
21264 else
21266 for (i = 0; i < nparts; i++)
21268 operands[2 + i] = part[0][i];
21269 operands[6 + i] = part[1][i];
21273 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21274 if (optimize_insn_for_size_p ())
21276 for (j = 0; j < nparts - 1; j++)
21277 if (CONST_INT_P (operands[6 + j])
21278 && operands[6 + j] != const0_rtx
21279 && REG_P (operands[2 + j]))
21280 for (i = j; i < nparts - 1; i++)
21281 if (CONST_INT_P (operands[7 + i])
21282 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21283 operands[7 + i] = operands[2 + j];
21286 for (i = 0; i < nparts; i++)
21287 emit_move_insn (operands[2 + i], operands[6 + i]);
21289 return;
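/* Illustrative note (not part of the original source): a typical case the
   reordering above protects against is a 32-bit DImode load such as moving
   (mem:DI (reg:SI %eax)) into %eax/%edx.  Copying the low word first would
   clobber the address register, so the parts are emitted high word first:
       movl 4(%eax), %edx
       movl (%eax), %eax  */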
21292 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21293 left shift by a constant, either using a single shift or
21294 a sequence of add instructions. */
21296 static void
21297 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21299 rtx (*insn)(rtx, rtx, rtx);
21301 if (count == 1
21302 || (count * ix86_cost->add <= ix86_cost->shift_const
21303 && !optimize_insn_for_size_p ()))
21305 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21306 while (count-- > 0)
21307 emit_insn (insn (operand, operand, operand));
21309 else
21311 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21312 emit_insn (insn (operand, operand, GEN_INT (count)));
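/* Illustrative note (not part of the original source): for a shift count of 1,
   or whenever COUNT additions are cheaper than one constant shift, the helper
   above emits repeated self-additions; e.g. an SImode operand shifted left by
   one becomes a single "addl %reg, %reg".  */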
21316 void
21317 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21319 rtx (*gen_ashl3)(rtx, rtx, rtx);
21320 rtx (*gen_shld)(rtx, rtx, rtx);
21321 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21323 rtx low[2], high[2];
21324 int count;
21326 if (CONST_INT_P (operands[2]))
21328 split_double_mode (mode, operands, 2, low, high);
21329 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21331 if (count >= half_width)
21333 emit_move_insn (high[0], low[1]);
21334 emit_move_insn (low[0], const0_rtx);
21336 if (count > half_width)
21337 ix86_expand_ashl_const (high[0], count - half_width, mode);
21339 else
21341 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21343 if (!rtx_equal_p (operands[0], operands[1]))
21344 emit_move_insn (operands[0], operands[1]);
21346 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21347 ix86_expand_ashl_const (low[0], count, mode);
21349 return;
21352 split_double_mode (mode, operands, 1, low, high);
21354 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21356 if (operands[1] == const1_rtx)
21358 /* Assuming we've chosen QImode-capable registers, 1 << N
21359 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21360 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21362 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21364 ix86_expand_clear (low[0]);
21365 ix86_expand_clear (high[0]);
21366 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21368 d = gen_lowpart (QImode, low[0]);
21369 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21370 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21371 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21373 d = gen_lowpart (QImode, high[0]);
21374 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21375 s = gen_rtx_NE (QImode, flags, const0_rtx);
21376 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21379 /* Otherwise, we can get the same results by manually performing
21380 a bit extract operation on bit 5/6, and then performing the two
21381 shifts. The two methods of getting 0/1 into low/high are exactly
21382 the same size. Avoiding the shift in the bit extract case helps
21383 pentium4 a bit; no one else seems to care much either way. */
21384 else
21386 enum machine_mode half_mode;
21387 rtx (*gen_lshr3)(rtx, rtx, rtx);
21388 rtx (*gen_and3)(rtx, rtx, rtx);
21389 rtx (*gen_xor3)(rtx, rtx, rtx);
21390 HOST_WIDE_INT bits;
21391 rtx x;
21393 if (mode == DImode)
21395 half_mode = SImode;
21396 gen_lshr3 = gen_lshrsi3;
21397 gen_and3 = gen_andsi3;
21398 gen_xor3 = gen_xorsi3;
21399 bits = 5;
21401 else
21403 half_mode = DImode;
21404 gen_lshr3 = gen_lshrdi3;
21405 gen_and3 = gen_anddi3;
21406 gen_xor3 = gen_xordi3;
21407 bits = 6;
21410 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21411 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21412 else
21413 x = gen_lowpart (half_mode, operands[2]);
21414 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21416 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21417 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21418 emit_move_insn (low[0], high[0]);
21419 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21422 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21423 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21424 return;
21427 if (operands[1] == constm1_rtx)
21429 /* For -1 << N, we can avoid the shld instruction, because we
21430 know that we're shifting 0...31/63 ones into a -1. */
21431 emit_move_insn (low[0], constm1_rtx);
21432 if (optimize_insn_for_size_p ())
21433 emit_move_insn (high[0], low[0]);
21434 else
21435 emit_move_insn (high[0], constm1_rtx);
21437 else
21439 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21441 if (!rtx_equal_p (operands[0], operands[1]))
21442 emit_move_insn (operands[0], operands[1]);
21444 split_double_mode (mode, operands, 1, low, high);
21445 emit_insn (gen_shld (high[0], low[0], operands[2]));
21448 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21450 if (TARGET_CMOVE && scratch)
21452 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21453 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21455 ix86_expand_clear (scratch);
21456 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21458 else
21460 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21461 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21463 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
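/* Illustrative note (not part of the original source): for a variable DImode
   shift on a 32-bit target, the general path above emits roughly
       shld %cl, %low, %high    feed low-order bits into the high word
       sal  %cl, %low           shift the low word
   followed by one of the x86_shift*_adj patterns (cmove-based when a scratch
   register is available, jump-based otherwise), which fixes up the result
   when the run-time count is 32 or more.  */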
21467 void
21468 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21470 rtx (*gen_ashr3)(rtx, rtx, rtx)
21471 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21472 rtx (*gen_shrd)(rtx, rtx, rtx);
21473 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21475 rtx low[2], high[2];
21476 int count;
21478 if (CONST_INT_P (operands[2]))
21480 split_double_mode (mode, operands, 2, low, high);
21481 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21483 if (count == GET_MODE_BITSIZE (mode) - 1)
21485 emit_move_insn (high[0], high[1]);
21486 emit_insn (gen_ashr3 (high[0], high[0],
21487 GEN_INT (half_width - 1)));
21488 emit_move_insn (low[0], high[0]);
21491 else if (count >= half_width)
21493 emit_move_insn (low[0], high[1]);
21494 emit_move_insn (high[0], low[0]);
21495 emit_insn (gen_ashr3 (high[0], high[0],
21496 GEN_INT (half_width - 1)));
21498 if (count > half_width)
21499 emit_insn (gen_ashr3 (low[0], low[0],
21500 GEN_INT (count - half_width)));
21502 else
21504 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21506 if (!rtx_equal_p (operands[0], operands[1]))
21507 emit_move_insn (operands[0], operands[1]);
21509 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21510 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21513 else
21515 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21517 if (!rtx_equal_p (operands[0], operands[1]))
21518 emit_move_insn (operands[0], operands[1]);
21520 split_double_mode (mode, operands, 1, low, high);
21522 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21523 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21525 if (TARGET_CMOVE && scratch)
21527 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21528 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21530 emit_move_insn (scratch, high[0]);
21531 emit_insn (gen_ashr3 (scratch, scratch,
21532 GEN_INT (half_width - 1)));
21533 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21534 scratch));
21536 else
21538 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21539 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21541 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
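/* Illustrative note (not part of the original source): the count == bitsize-1
   special case above broadcasts the sign bit; e.g. a DImode arithmetic shift
   right by 63 on a 32-bit target reduces to
       sarl $31, %high
       movl %high, %low
   so both words end up equal to the sign mask of the input.  */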
21546 void
21547 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21549 rtx (*gen_lshr3)(rtx, rtx, rtx)
21550 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21551 rtx (*gen_shrd)(rtx, rtx, rtx);
21552 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21554 rtx low[2], high[2];
21555 int count;
21557 if (CONST_INT_P (operands[2]))
21559 split_double_mode (mode, operands, 2, low, high);
21560 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21562 if (count >= half_width)
21564 emit_move_insn (low[0], high[1]);
21565 ix86_expand_clear (high[0]);
21567 if (count > half_width)
21568 emit_insn (gen_lshr3 (low[0], low[0],
21569 GEN_INT (count - half_width)));
21571 else
21573 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21575 if (!rtx_equal_p (operands[0], operands[1]))
21576 emit_move_insn (operands[0], operands[1]);
21578 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21579 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21582 else
21584 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21586 if (!rtx_equal_p (operands[0], operands[1]))
21587 emit_move_insn (operands[0], operands[1]);
21589 split_double_mode (mode, operands, 1, low, high);
21591 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21592 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21594 if (TARGET_CMOVE && scratch)
21596 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21597 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21599 ix86_expand_clear (scratch);
21600 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21601 scratch));
21603 else
21605 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21606 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21608 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21613 /* Predict just emitted jump instruction to be taken with probability PROB. */
21614 static void
21615 predict_jump (int prob)
21617 rtx insn = get_last_insn ();
21618 gcc_assert (JUMP_P (insn));
21619 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21622 /* Helper function for the string operations below. Test whether VARIABLE
21623 is aligned to VALUE bytes; if it is, jump to the label that is returned. */
21624 static rtx
21625 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21627 rtx label = gen_label_rtx ();
21628 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21629 if (GET_MODE (variable) == DImode)
21630 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21631 else
21632 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21633 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21634 1, label);
21635 if (epilogue)
21636 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21637 else
21638 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21639 return label;
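/* Illustrative note (not part of the original source): a call such as
   ix86_expand_aligntest (destptr, 4, false) emits, roughly,
       andl $4, %tmp
       je   .Lskip
   and returns the .Lskip label; the caller places its 4-byte fix-up copy
   before emitting the label, so the fix-up only runs when that alignment
   bit is set.  */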
21642 /* Decrease COUNTREG by VALUE. */
21643 static void
21644 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21646 rtx (*gen_add)(rtx, rtx, rtx)
21647 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21649 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21652 /* Zero-extend the (possibly SImode) EXP into a Pmode register. */
21654 ix86_zero_extend_to_Pmode (rtx exp)
21656 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21659 /* Divide COUNTREG by SCALE. */
21660 static rtx
21661 scale_counter (rtx countreg, int scale)
21663 rtx sc;
21665 if (scale == 1)
21666 return countreg;
21667 if (CONST_INT_P (countreg))
21668 return GEN_INT (INTVAL (countreg) / scale);
21669 gcc_assert (REG_P (countreg));
21671 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21672 GEN_INT (exact_log2 (scale)),
21673 NULL, 1, OPTAB_DIRECT);
21674 return sc;
21677 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21678 DImode for constant loop counts. */
21680 static enum machine_mode
21681 counter_mode (rtx count_exp)
21683 if (GET_MODE (count_exp) != VOIDmode)
21684 return GET_MODE (count_exp);
21685 if (!CONST_INT_P (count_exp))
21686 return Pmode;
21687 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21688 return DImode;
21689 return SImode;
21692 /* When SRCPTR is non-NULL, output a simple loop that copies memory
21693 from SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21694 the overall size is COUNT bytes. When SRCPTR is NULL, output the
21695 equivalent loop that sets memory to VALUE (expected to be in MODE).
21697 The size is rounded down to a whole number of chunks moved at once.
21698 SRCMEM and DESTMEM provide the MEM rtx used to carry the aliasing info. */
21701 static void
21702 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21703 rtx destptr, rtx srcptr, rtx value,
21704 rtx count, enum machine_mode mode, int unroll,
21705 int expected_size)
21707 rtx out_label, top_label, iter, tmp;
21708 enum machine_mode iter_mode = counter_mode (count);
21709 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21710 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21711 rtx size;
21712 rtx x_addr;
21713 rtx y_addr;
21714 int i;
21716 top_label = gen_label_rtx ();
21717 out_label = gen_label_rtx ();
21718 iter = gen_reg_rtx (iter_mode);
21720 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21721 NULL, 1, OPTAB_DIRECT);
21722 /* Those two should combine. */
21723 if (piece_size == const1_rtx)
21725 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21726 true, out_label);
21727 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21729 emit_move_insn (iter, const0_rtx);
21731 emit_label (top_label);
21733 tmp = convert_modes (Pmode, iter_mode, iter, true);
21734 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21735 destmem = change_address (destmem, mode, x_addr);
21737 if (srcmem)
21739 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21740 srcmem = change_address (srcmem, mode, y_addr);
21742 /* When unrolling for chips that reorder memory reads and writes,
21743 we can save registers by using a single temporary.
21744 Using 4 temporaries is also overkill in 32-bit mode. */
21745 if (!TARGET_64BIT && 0)
21747 for (i = 0; i < unroll; i++)
21749 if (i)
21751 destmem =
21752 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21753 srcmem =
21754 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21756 emit_move_insn (destmem, srcmem);
21759 else
21761 rtx tmpreg[4];
21762 gcc_assert (unroll <= 4);
21763 for (i = 0; i < unroll; i++)
21765 tmpreg[i] = gen_reg_rtx (mode);
21766 if (i)
21768 srcmem =
21769 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21771 emit_move_insn (tmpreg[i], srcmem);
21773 for (i = 0; i < unroll; i++)
21775 if (i)
21777 destmem =
21778 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21780 emit_move_insn (destmem, tmpreg[i]);
21784 else
21785 for (i = 0; i < unroll; i++)
21787 if (i)
21788 destmem =
21789 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21790 emit_move_insn (destmem, value);
21793 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21794 true, OPTAB_LIB_WIDEN);
21795 if (tmp != iter)
21796 emit_move_insn (iter, tmp);
21798 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21799 true, top_label);
21800 if (expected_size != -1)
21802 expected_size /= GET_MODE_SIZE (mode) * unroll;
21803 if (expected_size == 0)
21804 predict_jump (0);
21805 else if (expected_size > REG_BR_PROB_BASE)
21806 predict_jump (REG_BR_PROB_BASE - 1);
21807 else
21808 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21810 else
21811 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21812 iter = ix86_zero_extend_to_Pmode (iter);
21813 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21814 true, OPTAB_LIB_WIDEN);
21815 if (tmp != destptr)
21816 emit_move_insn (destptr, tmp);
21817 if (srcptr)
21819 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21820 true, OPTAB_LIB_WIDEN);
21821 if (tmp != srcptr)
21822 emit_move_insn (srcptr, tmp);
21824 emit_label (out_label);
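/* Illustrative note (not part of the original source): the code emitted above
   has roughly this shape, shown as pseudo-C:

       size = count & ~(piece - 1);          piece = chunk size * UNROLL
       iter = 0;
       do
         {
           copy or store one MODE chunk, UNROLL times, at offset iter;
           iter += piece;
         }
       while (iter < size);
       destptr += iter;                      and srcptr, for the copy case

   When the piece size is a single byte, an initial test skips the loop for
   size == 0.  The remaining count % piece bytes are left for the caller's
   epilogue.  */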
21827 /* Output "rep; mov" instruction.
21828 Arguments have same meaning as for previous function */
21829 static void
21830 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21831 rtx destptr, rtx srcptr,
21832 rtx count,
21833 enum machine_mode mode)
21835 rtx destexp;
21836 rtx srcexp;
21837 rtx countreg;
21838 HOST_WIDE_INT rounded_count;
21840 /* If the count is known and a multiple of 4, rep movsl is shorter than rep movsb. */
21841 if (mode == QImode && CONST_INT_P (count)
21842 && !(INTVAL (count) & 3))
21843 mode = SImode;
21845 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21846 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21847 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21848 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21849 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21850 if (mode != QImode)
21852 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21853 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21854 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21855 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21856 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21857 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21859 else
21861 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21862 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21864 if (CONST_INT_P (count))
21866 rounded_count = (INTVAL (count)
21867 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21868 destmem = shallow_copy_rtx (destmem);
21869 srcmem = shallow_copy_rtx (srcmem);
21870 set_mem_size (destmem, rounded_count);
21871 set_mem_size (srcmem, rounded_count);
21873 else
21875 if (MEM_SIZE_KNOWN_P (destmem))
21876 clear_mem_size (destmem);
21877 if (MEM_SIZE_KNOWN_P (srcmem))
21878 clear_mem_size (srcmem);
21880 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21881 destexp, srcexp));
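/* Illustrative note (not part of the original source): for SImode and a count
   known to be a multiple of 4, the code above boils down to
       movl $count/4, %ecx
       rep movsl
   with DESTEXP/SRCEXP describing the final values of the destination and
   source pointers so the rep_mov pattern carries correct dataflow info.  */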
21884 /* Output "rep; stos" instruction.
21885 Arguments have same meaning as for previous function */
21886 static void
21887 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21888 rtx count, enum machine_mode mode,
21889 rtx orig_value)
21891 rtx destexp;
21892 rtx countreg;
21893 HOST_WIDE_INT rounded_count;
21895 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21896 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21897 value = force_reg (mode, gen_lowpart (mode, value));
21898 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21899 if (mode != QImode)
21901 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21902 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21903 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21905 else
21906 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21907 if (orig_value == const0_rtx && CONST_INT_P (count))
21909 rounded_count = (INTVAL (count)
21910 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21911 destmem = shallow_copy_rtx (destmem);
21912 set_mem_size (destmem, rounded_count);
21914 else if (MEM_SIZE_KNOWN_P (destmem))
21915 clear_mem_size (destmem);
21916 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21919 static void
21920 emit_strmov (rtx destmem, rtx srcmem,
21921 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21923 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21924 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21925 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21928 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21929 static void
21930 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21931 rtx destptr, rtx srcptr, rtx count, int max_size)
21933 rtx src, dest;
21934 if (CONST_INT_P (count))
21936 HOST_WIDE_INT countval = INTVAL (count);
21937 int offset = 0;
21939 if ((countval & 0x10) && max_size > 16)
21941 if (TARGET_64BIT)
21943 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21944 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21946 else
21947 gcc_unreachable ();
21948 offset += 16;
21950 if ((countval & 0x08) && max_size > 8)
21952 if (TARGET_64BIT)
21953 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21954 else
21956 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21957 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21959 offset += 8;
21961 if ((countval & 0x04) && max_size > 4)
21963 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21964 offset += 4;
21966 if ((countval & 0x02) && max_size > 2)
21968 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21969 offset += 2;
21971 if ((countval & 0x01) && max_size > 1)
21973 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21974 offset += 1;
21976 return;
21978 if (max_size > 8)
21980 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21981 count, 1, OPTAB_DIRECT);
21982 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21983 count, QImode, 1, 4);
21984 return;
21987 /* When single-instruction stringops are available, we can cheaply advance
21988 the dest and src pointers. Otherwise we save code size by maintaining an
21989 offset (zero is readily available from the preceding rep operation) and using x86 addressing modes. */
21991 if (TARGET_SINGLE_STRINGOP)
21993 if (max_size > 4)
21995 rtx label = ix86_expand_aligntest (count, 4, true);
21996 src = change_address (srcmem, SImode, srcptr);
21997 dest = change_address (destmem, SImode, destptr);
21998 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21999 emit_label (label);
22000 LABEL_NUSES (label) = 1;
22002 if (max_size > 2)
22004 rtx label = ix86_expand_aligntest (count, 2, true);
22005 src = change_address (srcmem, HImode, srcptr);
22006 dest = change_address (destmem, HImode, destptr);
22007 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22008 emit_label (label);
22009 LABEL_NUSES (label) = 1;
22011 if (max_size > 1)
22013 rtx label = ix86_expand_aligntest (count, 1, true);
22014 src = change_address (srcmem, QImode, srcptr);
22015 dest = change_address (destmem, QImode, destptr);
22016 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22017 emit_label (label);
22018 LABEL_NUSES (label) = 1;
22021 else
22023 rtx offset = force_reg (Pmode, const0_rtx);
22024 rtx tmp;
22026 if (max_size > 4)
22028 rtx label = ix86_expand_aligntest (count, 4, true);
22029 src = change_address (srcmem, SImode, srcptr);
22030 dest = change_address (destmem, SImode, destptr);
22031 emit_move_insn (dest, src);
22032 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22033 true, OPTAB_LIB_WIDEN);
22034 if (tmp != offset)
22035 emit_move_insn (offset, tmp);
22036 emit_label (label);
22037 LABEL_NUSES (label) = 1;
22039 if (max_size > 2)
22041 rtx label = ix86_expand_aligntest (count, 2, true);
22042 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22043 src = change_address (srcmem, HImode, tmp);
22044 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22045 dest = change_address (destmem, HImode, tmp);
22046 emit_move_insn (dest, src);
22047 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22048 true, OPTAB_LIB_WIDEN);
22049 if (tmp != offset)
22050 emit_move_insn (offset, tmp);
22051 emit_label (label);
22052 LABEL_NUSES (label) = 1;
22054 if (max_size > 1)
22056 rtx label = ix86_expand_aligntest (count, 1, true);
22057 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22058 src = change_address (srcmem, QImode, tmp);
22059 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22060 dest = change_address (destmem, QImode, tmp);
22061 emit_move_insn (dest, src);
22062 emit_label (label);
22063 LABEL_NUSES (label) = 1;
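/* Illustrative note (not part of the original source): with a constant count
   the epilogue above is a straight-line, jump-free sequence; e.g. 7 trailing
   bytes on a 32-bit target are copied as one SImode, one HImode and one
   QImode move at offsets 0, 4 and 6.  */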
22068 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22069 static void
22070 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22071 rtx count, int max_size)
22073 count =
22074 expand_simple_binop (counter_mode (count), AND, count,
22075 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22076 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22077 gen_lowpart (QImode, value), count, QImode,
22078 1, max_size / 2);
22081 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22082 static void
22083 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22085 rtx dest;
22087 if (CONST_INT_P (count))
22089 HOST_WIDE_INT countval = INTVAL (count);
22090 int offset = 0;
22092 if ((countval & 0x10) && max_size > 16)
22094 if (TARGET_64BIT)
22096 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22097 emit_insn (gen_strset (destptr, dest, value));
22098 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22099 emit_insn (gen_strset (destptr, dest, value));
22101 else
22102 gcc_unreachable ();
22103 offset += 16;
22105 if ((countval & 0x08) && max_size > 8)
22107 if (TARGET_64BIT)
22109 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22110 emit_insn (gen_strset (destptr, dest, value));
22112 else
22114 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22115 emit_insn (gen_strset (destptr, dest, value));
22116 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22117 emit_insn (gen_strset (destptr, dest, value));
22119 offset += 8;
22121 if ((countval & 0x04) && max_size > 4)
22123 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22124 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22125 offset += 4;
22127 if ((countval & 0x02) && max_size > 2)
22129 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22130 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22131 offset += 2;
22133 if ((countval & 0x01) && max_size > 1)
22135 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22136 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22137 offset += 1;
22139 return;
22141 if (max_size > 32)
22143 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22144 return;
22146 if (max_size > 16)
22148 rtx label = ix86_expand_aligntest (count, 16, true);
22149 if (TARGET_64BIT)
22151 dest = change_address (destmem, DImode, destptr);
22152 emit_insn (gen_strset (destptr, dest, value));
22153 emit_insn (gen_strset (destptr, dest, value));
22155 else
22157 dest = change_address (destmem, SImode, destptr);
22158 emit_insn (gen_strset (destptr, dest, value));
22159 emit_insn (gen_strset (destptr, dest, value));
22160 emit_insn (gen_strset (destptr, dest, value));
22161 emit_insn (gen_strset (destptr, dest, value));
22163 emit_label (label);
22164 LABEL_NUSES (label) = 1;
22166 if (max_size > 8)
22168 rtx label = ix86_expand_aligntest (count, 8, true);
22169 if (TARGET_64BIT)
22171 dest = change_address (destmem, DImode, destptr);
22172 emit_insn (gen_strset (destptr, dest, value));
22174 else
22176 dest = change_address (destmem, SImode, destptr);
22177 emit_insn (gen_strset (destptr, dest, value));
22178 emit_insn (gen_strset (destptr, dest, value));
22180 emit_label (label);
22181 LABEL_NUSES (label) = 1;
22183 if (max_size > 4)
22185 rtx label = ix86_expand_aligntest (count, 4, true);
22186 dest = change_address (destmem, SImode, destptr);
22187 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22188 emit_label (label);
22189 LABEL_NUSES (label) = 1;
22191 if (max_size > 2)
22193 rtx label = ix86_expand_aligntest (count, 2, true);
22194 dest = change_address (destmem, HImode, destptr);
22195 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22196 emit_label (label);
22197 LABEL_NUSES (label) = 1;
22199 if (max_size > 1)
22201 rtx label = ix86_expand_aligntest (count, 1, true);
22202 dest = change_address (destmem, QImode, destptr);
22203 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22204 emit_label (label);
22205 LABEL_NUSES (label) = 1;
22209 /* Copy enough bytes from SRC to DEST to align DEST, which is known to be
22210 aligned to ALIGN, up to DESIRED_ALIGNMENT. */
22211 static void
22212 expand_movmem_prologue (rtx destmem, rtx srcmem,
22213 rtx destptr, rtx srcptr, rtx count,
22214 int align, int desired_alignment)
22216 if (align <= 1 && desired_alignment > 1)
22218 rtx label = ix86_expand_aligntest (destptr, 1, false);
22219 srcmem = change_address (srcmem, QImode, srcptr);
22220 destmem = change_address (destmem, QImode, destptr);
22221 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22222 ix86_adjust_counter (count, 1);
22223 emit_label (label);
22224 LABEL_NUSES (label) = 1;
22226 if (align <= 2 && desired_alignment > 2)
22228 rtx label = ix86_expand_aligntest (destptr, 2, false);
22229 srcmem = change_address (srcmem, HImode, srcptr);
22230 destmem = change_address (destmem, HImode, destptr);
22231 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22232 ix86_adjust_counter (count, 2);
22233 emit_label (label);
22234 LABEL_NUSES (label) = 1;
22236 if (align <= 4 && desired_alignment > 4)
22238 rtx label = ix86_expand_aligntest (destptr, 4, false);
22239 srcmem = change_address (srcmem, SImode, srcptr);
22240 destmem = change_address (destmem, SImode, destptr);
22241 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22242 ix86_adjust_counter (count, 4);
22243 emit_label (label);
22244 LABEL_NUSES (label) = 1;
22246 gcc_assert (desired_alignment <= 8);
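/* Illustrative note (not part of the original source): each step above is one
   conditional single move (1, 2 or 4 bytes) guarded by ix86_expand_aligntest
   on DESTPTR, so raising an unknown alignment to 8 copies at most
   1 + 2 + 4 = 7 bytes before the main loop.  */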
22249 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
22250 ALIGN_BYTES is how many bytes need to be copied. */
22251 static rtx
22252 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22253 int desired_align, int align_bytes)
22255 rtx src = *srcp;
22256 rtx orig_dst = dst;
22257 rtx orig_src = src;
22258 int off = 0;
22259 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22260 if (src_align_bytes >= 0)
22261 src_align_bytes = desired_align - src_align_bytes;
22262 if (align_bytes & 1)
22264 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22265 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22266 off = 1;
22267 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22269 if (align_bytes & 2)
22271 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22272 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22273 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22274 set_mem_align (dst, 2 * BITS_PER_UNIT);
22275 if (src_align_bytes >= 0
22276 && (src_align_bytes & 1) == (align_bytes & 1)
22277 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22278 set_mem_align (src, 2 * BITS_PER_UNIT);
22279 off = 2;
22280 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22282 if (align_bytes & 4)
22284 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22285 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22286 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22287 set_mem_align (dst, 4 * BITS_PER_UNIT);
22288 if (src_align_bytes >= 0)
22290 unsigned int src_align = 0;
22291 if ((src_align_bytes & 3) == (align_bytes & 3))
22292 src_align = 4;
22293 else if ((src_align_bytes & 1) == (align_bytes & 1))
22294 src_align = 2;
22295 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22296 set_mem_align (src, src_align * BITS_PER_UNIT);
22298 off = 4;
22299 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22301 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22302 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22303 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22304 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22305 if (src_align_bytes >= 0)
22307 unsigned int src_align = 0;
22308 if ((src_align_bytes & 7) == (align_bytes & 7))
22309 src_align = 8;
22310 else if ((src_align_bytes & 3) == (align_bytes & 3))
22311 src_align = 4;
22312 else if ((src_align_bytes & 1) == (align_bytes & 1))
22313 src_align = 2;
22314 if (src_align > (unsigned int) desired_align)
22315 src_align = desired_align;
22316 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22317 set_mem_align (src, src_align * BITS_PER_UNIT);
22319 if (MEM_SIZE_KNOWN_P (orig_dst))
22320 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22321 if (MEM_SIZE_KNOWN_P (orig_src))
22322 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22323 *srcp = src;
22324 return dst;
22327 /* Store enough bytes at DEST to align DEST, which is known to be aligned
22328 to ALIGN, up to DESIRED_ALIGNMENT. */
22329 static void
22330 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22331 int align, int desired_alignment)
22333 if (align <= 1 && desired_alignment > 1)
22335 rtx label = ix86_expand_aligntest (destptr, 1, false);
22336 destmem = change_address (destmem, QImode, destptr);
22337 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22338 ix86_adjust_counter (count, 1);
22339 emit_label (label);
22340 LABEL_NUSES (label) = 1;
22342 if (align <= 2 && desired_alignment > 2)
22344 rtx label = ix86_expand_aligntest (destptr, 2, false);
22345 destmem = change_address (destmem, HImode, destptr);
22346 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22347 ix86_adjust_counter (count, 2);
22348 emit_label (label);
22349 LABEL_NUSES (label) = 1;
22351 if (align <= 4 && desired_alignment > 4)
22353 rtx label = ix86_expand_aligntest (destptr, 4, false);
22354 destmem = change_address (destmem, SImode, destptr);
22355 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22356 ix86_adjust_counter (count, 4);
22357 emit_label (label);
22358 LABEL_NUSES (label) = 1;
22360 gcc_assert (desired_alignment <= 8);
22363 /* Store enough bytes at DST to align DST, which is known to be aligned to
22364 ALIGN, up to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22365 static rtx
22366 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22367 int desired_align, int align_bytes)
22369 int off = 0;
22370 rtx orig_dst = dst;
22371 if (align_bytes & 1)
22373 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22374 off = 1;
22375 emit_insn (gen_strset (destreg, dst,
22376 gen_lowpart (QImode, value)));
22378 if (align_bytes & 2)
22380 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22381 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22382 set_mem_align (dst, 2 * BITS_PER_UNIT);
22383 off = 2;
22384 emit_insn (gen_strset (destreg, dst,
22385 gen_lowpart (HImode, value)));
22387 if (align_bytes & 4)
22389 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22390 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22391 set_mem_align (dst, 4 * BITS_PER_UNIT);
22392 off = 4;
22393 emit_insn (gen_strset (destreg, dst,
22394 gen_lowpart (SImode, value)));
22396 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22397 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22398 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22399 if (MEM_SIZE_KNOWN_P (orig_dst))
22400 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22401 return dst;
22404 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22405 static enum stringop_alg
22406 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22407 int *dynamic_check, bool *noalign)
22409 const struct stringop_algs * algs;
22410 bool optimize_for_speed;
22411 /* Algorithms using the rep prefix want at least edi and ecx;
22412 additionally, memset wants eax and memcpy wants esi. Don't
22413 consider such algorithms if the user has appropriated those
22414 registers for their own purposes. */
22415 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22416 || (memset
22417 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22418 *noalign = false;
22420 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22421 || (alg != rep_prefix_1_byte \
22422 && alg != rep_prefix_4_byte \
22423 && alg != rep_prefix_8_byte))
22424 const struct processor_costs *cost;
22426 /* Even if the string operation call is cold, we still might spend a lot
22427 of time processing large blocks. */
22428 if (optimize_function_for_size_p (cfun)
22429 || (optimize_insn_for_size_p ()
22430 && expected_size != -1 && expected_size < 256))
22431 optimize_for_speed = false;
22432 else
22433 optimize_for_speed = true;
22435 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22437 *dynamic_check = -1;
22438 if (memset)
22439 algs = &cost->memset[TARGET_64BIT != 0];
22440 else
22441 algs = &cost->memcpy[TARGET_64BIT != 0];
22442 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22443 return ix86_stringop_alg;
22444 /* rep; movq or rep; movl is the smallest variant. */
22445 else if (!optimize_for_speed)
22447 if (!count || (count & 3))
22448 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22449 else
22450 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22452 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
22454 else if (expected_size != -1 && expected_size < 4)
22455 return loop_1_byte;
22456 else if (expected_size != -1)
22458 unsigned int i;
22459 enum stringop_alg alg = libcall;
22460 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22462 /* We get here if the algorithms that were not libcall-based
22463 were rep-prefix based and we are unable to use rep prefixes
22464 based on global register usage. Break out of the loop and
22465 use the heuristic below. */
22466 if (algs->size[i].max == 0)
22467 break;
22468 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22470 enum stringop_alg candidate = algs->size[i].alg;
22472 if (candidate != libcall && ALG_USABLE_P (candidate))
22473 alg = candidate;
22474 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22475 last non-libcall inline algorithm. */
22476 if (TARGET_INLINE_ALL_STRINGOPS)
22478 /* When the current size is best copied by a libcall, but we are
22479 still forced to inline, run the heuristic below that picks code
22480 for medium-sized blocks. */
22481 if (alg != libcall)
22482 return alg;
22483 break;
22485 else if (ALG_USABLE_P (candidate))
22487 *noalign = algs->size[i].noalign;
22488 return candidate;
22492 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22494 /* When asked to inline the call anyway, try to pick a meaningful choice.
22495 We look for the maximal block size that is faster to copy by hand and
22496 take blocks of at most that size, guessing that the average size will
22497 be roughly half of the maximum.
22499 If this turns out to be bad, we might simply specify the preferred
22500 choice in ix86_costs. */
22501 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22502 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22504 int max = -1;
22505 enum stringop_alg alg;
22506 int i;
22507 bool any_alg_usable_p = true;
22509 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22511 enum stringop_alg candidate = algs->size[i].alg;
22512 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22514 if (candidate != libcall && candidate
22515 && ALG_USABLE_P (candidate))
22516 max = algs->size[i].max;
22518 /* If there aren't any usable algorithms, then recursing on
22519 smaller sizes isn't going to find anything. Just return the
22520 simple byte-at-a-time copy loop. */
22521 if (!any_alg_usable_p)
22523 /* Pick something reasonable. */
22524 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22525 *dynamic_check = 128;
22526 return loop_1_byte;
22528 if (max == -1)
22529 max = 4096;
22530 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22531 gcc_assert (*dynamic_check == -1);
22532 gcc_assert (alg != libcall);
22533 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22534 *dynamic_check = max;
22535 return alg;
22537 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22538 #undef ALG_USABLE_P
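/* Illustrative note (not part of the original source): with a known
   EXPECTED_SIZE the loop above walks the per-CPU stringop_algs table and
   returns the algorithm of the first entry whose "max" covers the size.
   For example, if a tuning table mapped sizes up to 32 to loop_1_byte and
   larger sizes to rep_prefix_4_byte, an expected size of 200 would select
   the rep-prefixed variant (provided rep prefixes are usable here).  */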
22541 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22542 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22543 static int
22544 decide_alignment (int align,
22545 enum stringop_alg alg,
22546 int expected_size)
22548 int desired_align = 0;
22549 switch (alg)
22551 case no_stringop:
22552 gcc_unreachable ();
22553 case loop:
22554 case unrolled_loop:
22555 desired_align = GET_MODE_SIZE (Pmode);
22556 break;
22557 case rep_prefix_8_byte:
22558 desired_align = 8;
22559 break;
22560 case rep_prefix_4_byte:
22561 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22562 copying a whole cache line at once. */
22563 if (TARGET_PENTIUMPRO)
22564 desired_align = 8;
22565 else
22566 desired_align = 4;
22567 break;
22568 case rep_prefix_1_byte:
22569 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22570 copying a whole cache line at once. */
22571 if (TARGET_PENTIUMPRO)
22572 desired_align = 8;
22573 else
22574 desired_align = 1;
22575 break;
22576 case loop_1_byte:
22577 desired_align = 1;
22578 break;
22579 case libcall:
22580 return 0;
22583 if (optimize_size)
22584 desired_align = 1;
22585 if (desired_align < align)
22586 desired_align = align;
22587 if (expected_size != -1 && expected_size < 4)
22588 desired_align = align;
22589 return desired_align;
22592 /* Return the smallest power of 2 greater than VAL. */
22593 static int
22594 smallest_pow2_greater_than (int val)
22596 int ret = 1;
22597 while (ret <= val)
22598 ret <<= 1;
22599 return ret;
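/* Illustrative note (not part of the original source): the comparison above is
   "<=", so the result is strictly greater than VAL; e.g.
   smallest_pow2_greater_than (4) == 8 and smallest_pow2_greater_than (5) == 8.  */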
22602 /* Expand string move (memcpy) operation. Use i386 string operations
22603 when profitable. expand_setmem contains similar code. The code
22604 depends upon architecture, block size and alignment, but always has
22605 the same overall structure:
22607 1) Prologue guard: Conditional that jumps up to epilogues for small
22608 blocks that can be handled by epilogue alone. This is faster
22609 but also needed for correctness, since the prologue assumes the block
22610 is larger than the desired alignment.
22612 An optional dynamic size check, with a libcall for large blocks, is
22613 emitted here too when -minline-stringops-dynamically is used.
22615 2) Prologue: copy first few bytes in order to get destination
22616 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22617 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22618 copied. We emit either a jump tree of conditional power-of-two
22619 sized moves, or a byte loop.
22621 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22622 with specified algorithm.
22624 4) Epilogue: code copying the tail of the block that is too small to be
22625 handled by the main body (or up to the size guarded by the prologue guard). */
22627 bool
22628 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22629 rtx expected_align_exp, rtx expected_size_exp)
22631 rtx destreg;
22632 rtx srcreg;
22633 rtx label = NULL;
22634 rtx tmp;
22635 rtx jump_around_label = NULL;
22636 HOST_WIDE_INT align = 1;
22637 unsigned HOST_WIDE_INT count = 0;
22638 HOST_WIDE_INT expected_size = -1;
22639 int size_needed = 0, epilogue_size_needed;
22640 int desired_align = 0, align_bytes = 0;
22641 enum stringop_alg alg;
22642 int dynamic_check;
22643 bool need_zero_guard = false;
22644 bool noalign;
22646 if (CONST_INT_P (align_exp))
22647 align = INTVAL (align_exp);
22648 /* i386 can do misaligned access at a reasonable extra cost. */
22649 if (CONST_INT_P (expected_align_exp)
22650 && INTVAL (expected_align_exp) > align)
22651 align = INTVAL (expected_align_exp);
22652 /* ALIGN is the minimum of destination and source alignment, but we care here
22653 just about destination alignment. */
22654 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22655 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22657 if (CONST_INT_P (count_exp))
22658 count = expected_size = INTVAL (count_exp);
22659 if (CONST_INT_P (expected_size_exp) && count == 0)
22660 expected_size = INTVAL (expected_size_exp);
22662 /* Make sure we don't need to care about overflow later on. */
22663 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22664 return false;
22666 /* Step 0: Decide on preferred algorithm, desired alignment and
22667 size of chunks to be copied by main loop. */
22669 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22670 desired_align = decide_alignment (align, alg, expected_size);
22672 if (!TARGET_ALIGN_STRINGOPS || noalign)
22673 align = desired_align;
22675 if (alg == libcall)
22676 return false;
22677 gcc_assert (alg != no_stringop);
22678 if (!count)
22679 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22680 destreg = copy_addr_to_reg (XEXP (dst, 0));
22681 srcreg = copy_addr_to_reg (XEXP (src, 0));
22682 switch (alg)
22684 case libcall:
22685 case no_stringop:
22686 gcc_unreachable ();
22687 case loop:
22688 need_zero_guard = true;
22689 size_needed = GET_MODE_SIZE (word_mode);
22690 break;
22691 case unrolled_loop:
22692 need_zero_guard = true;
22693 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22694 break;
22695 case rep_prefix_8_byte:
22696 size_needed = 8;
22697 break;
22698 case rep_prefix_4_byte:
22699 size_needed = 4;
22700 break;
22701 case rep_prefix_1_byte:
22702 size_needed = 1;
22703 break;
22704 case loop_1_byte:
22705 need_zero_guard = true;
22706 size_needed = 1;
22707 break;
22710 epilogue_size_needed = size_needed;
22712 /* Step 1: Prologue guard. */
22714 /* Alignment code needs count to be in register. */
22715 if (CONST_INT_P (count_exp) && desired_align > align)
22717 if (INTVAL (count_exp) > desired_align
22718 && INTVAL (count_exp) > size_needed)
22720 align_bytes
22721 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22722 if (align_bytes <= 0)
22723 align_bytes = 0;
22724 else
22725 align_bytes = desired_align - align_bytes;
22727 if (align_bytes == 0)
22728 count_exp = force_reg (counter_mode (count_exp), count_exp);
22730 gcc_assert (desired_align >= 1 && align >= 1);
22732 /* Ensure that alignment prologue won't copy past end of block. */
22733 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22735 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22736 /* The epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
22737 Make sure it is a power of 2. */
22738 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22740 if (count)
22742 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22744 /* If the main algorithm works on QImode, no epilogue is needed.
22745 For small sizes just don't align anything. */
22746 if (size_needed == 1)
22747 desired_align = align;
22748 else
22749 goto epilogue;
22752 else
22754 label = gen_label_rtx ();
22755 emit_cmp_and_jump_insns (count_exp,
22756 GEN_INT (epilogue_size_needed),
22757 LTU, 0, counter_mode (count_exp), 1, label);
22758 if (expected_size == -1 || expected_size < epilogue_size_needed)
22759 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22760 else
22761 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22765 /* Emit code to decide at runtime whether a library call or inline code
22766 should be used. */
22767 if (dynamic_check != -1)
22769 if (CONST_INT_P (count_exp))
22771 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22773 emit_block_move_via_libcall (dst, src, count_exp, false);
22774 count_exp = const0_rtx;
22775 goto epilogue;
22778 else
22780 rtx hot_label = gen_label_rtx ();
22781 jump_around_label = gen_label_rtx ();
22782 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22783 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22784 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22785 emit_block_move_via_libcall (dst, src, count_exp, false);
22786 emit_jump (jump_around_label);
22787 emit_label (hot_label);
22791 /* Step 2: Alignment prologue. */
22793 if (desired_align > align)
22795 if (align_bytes == 0)
22797 /* Except for the first move in the epilogue, we no longer know
22798 the constant offset in the aliasing info. It doesn't seem worth
22799 the pain to maintain it for the first move, so throw away
22800 the info early. */
22801 src = change_address (src, BLKmode, srcreg);
22802 dst = change_address (dst, BLKmode, destreg);
22803 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22804 desired_align);
22806 else
22808 /* If we know how many bytes need to be stored before dst is
22809 sufficiently aligned, maintain aliasing info accurately. */
22810 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22811 desired_align, align_bytes);
22812 count_exp = plus_constant (counter_mode (count_exp),
22813 count_exp, -align_bytes);
22814 count -= align_bytes;
22816 if (need_zero_guard
22817 && (count < (unsigned HOST_WIDE_INT) size_needed
22818 || (align_bytes == 0
22819 && count < ((unsigned HOST_WIDE_INT) size_needed
22820 + desired_align - align))))
22822 /* It is possible that we copied enough so the main loop will not
22823 execute. */
22824 gcc_assert (size_needed > 1);
22825 if (label == NULL_RTX)
22826 label = gen_label_rtx ();
22827 emit_cmp_and_jump_insns (count_exp,
22828 GEN_INT (size_needed),
22829 LTU, 0, counter_mode (count_exp), 1, label);
22830 if (expected_size == -1
22831 || expected_size < (desired_align - align) / 2 + size_needed)
22832 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22833 else
22834 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22837 if (label && size_needed == 1)
22839 emit_label (label);
22840 LABEL_NUSES (label) = 1;
22841 label = NULL;
22842 epilogue_size_needed = 1;
22844 else if (label == NULL_RTX)
22845 epilogue_size_needed = size_needed;
22847 /* Step 3: Main loop. */
22849 switch (alg)
22851 case libcall:
22852 case no_stringop:
22853 gcc_unreachable ();
22854 case loop_1_byte:
22855 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22856 count_exp, QImode, 1, expected_size);
22857 break;
22858 case loop:
22859 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22860 count_exp, word_mode, 1, expected_size);
22861 break;
22862 case unrolled_loop:
22863 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22864 registers for 4 temporaries anyway. */
22865 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22866 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22867 expected_size);
22868 break;
22869 case rep_prefix_8_byte:
22870 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22871 DImode);
22872 break;
22873 case rep_prefix_4_byte:
22874 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22875 SImode);
22876 break;
22877 case rep_prefix_1_byte:
22878 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22879 QImode);
22880 break;
22882 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22883 if (CONST_INT_P (count_exp))
22885 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22886 (count / size_needed) * size_needed);
22887 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22888 (count / size_needed) * size_needed);
22890 else
22892 src = change_address (src, BLKmode, srcreg);
22893 dst = change_address (dst, BLKmode, destreg);
22896 /* Step 4: Epilogue to copy the remaining bytes. */
22897 epilogue:
22898 if (label)
22900 /* When the main loop is done, COUNT_EXP might hold the original count,
22901 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22902 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22903 bytes. Compensate if needed. */
22905 if (size_needed < epilogue_size_needed)
22907 tmp =
22908 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22909 GEN_INT (size_needed - 1), count_exp, 1,
22910 OPTAB_DIRECT);
22911 if (tmp != count_exp)
22912 emit_move_insn (count_exp, tmp);
22914 emit_label (label);
22915 LABEL_NUSES (label) = 1;
22918 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22919 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22920 epilogue_size_needed);
22921 if (jump_around_label)
22922 emit_label (jump_around_label);
22923 return true;
22926 /* Helper function for memset.  For the QImode value 0xXY produce
22927 0xXYXYXYXY of the width specified by MODE.  This is essentially
22928 a * 0x01010101, but we can do slightly better than
22929 synth_mult by unwinding the sequence by hand on CPUs with
22930 slow multiply. */
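/* A minimal sketch of the broadcast this function performs (plain C on an
   unsigned 32-bit value, not the RTL that is actually emitted):

     v |= v << 8;     0x000000XY -> 0x0000XYXY
     v |= v << 16;    0x0000XYXY -> 0xXYXYXYXY   (plus v |= v << 32 for DImode)

   i.e. the same result as multiplying by 0x01010101, but done with shifts and
   ORs because the multiply is slow on some CPUs.  */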
22931 static rtx
22932 promote_duplicated_reg (enum machine_mode mode, rtx val)
22934 enum machine_mode valmode = GET_MODE (val);
22935 rtx tmp;
22936 int nops = mode == DImode ? 3 : 2;
22938 gcc_assert (mode == SImode || mode == DImode);
22939 if (val == const0_rtx)
22940 return copy_to_mode_reg (mode, const0_rtx);
22941 if (CONST_INT_P (val))
22943 HOST_WIDE_INT v = INTVAL (val) & 255;
22945 v |= v << 8;
22946 v |= v << 16;
22947 if (mode == DImode)
22948 v |= (v << 16) << 16;
22949 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22952 if (valmode == VOIDmode)
22953 valmode = QImode;
22954 if (valmode != QImode)
22955 val = gen_lowpart (QImode, val);
22956 if (mode == QImode)
22957 return val;
22958 if (!TARGET_PARTIAL_REG_STALL)
22959 nops--;
22960 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22961 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22962 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22963 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22965 rtx reg = convert_modes (mode, QImode, val, true);
22966 tmp = promote_duplicated_reg (mode, const1_rtx);
22967 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22968 OPTAB_DIRECT);
22970 else
22972 rtx reg = convert_modes (mode, QImode, val, true);
22974 if (!TARGET_PARTIAL_REG_STALL)
22975 if (mode == SImode)
22976 emit_insn (gen_movsi_insv_1 (reg, reg));
22977 else
22978 emit_insn (gen_movdi_insv_1 (reg, reg));
22979 else
22981 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22982 NULL, 1, OPTAB_DIRECT);
22983 reg =
22984 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22986 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22987 NULL, 1, OPTAB_DIRECT);
22988 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22989 if (mode == SImode)
22990 return reg;
22991 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22992 NULL, 1, OPTAB_DIRECT);
22993 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22994 return reg;
22998 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
22999 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
23000 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
23001 static rtx
23002 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23004 rtx promoted_val;
23006 if (TARGET_64BIT
23007 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23008 promoted_val = promote_duplicated_reg (DImode, val);
23009 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23010 promoted_val = promote_duplicated_reg (SImode, val);
23011 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23012 promoted_val = promote_duplicated_reg (HImode, val);
23013 else
23014 promoted_val = val;
23016 return promoted_val;
23019 /* Expand string clear operation (bzero). Use i386 string operations when
23020 profitable. See expand_movmem comment for explanation of individual
23021 steps performed. */
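/* A hypothetical entry point, for illustration only: a call such as
   memset (p, 0, 40) would typically reach this function with
   COUNT_EXP == GEN_INT (40), VAL_EXP == const0_rtx and ALIGN_EXP describing
   the known alignment of p.  */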
23022 bool
23023 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23024 rtx expected_align_exp, rtx expected_size_exp)
23026 rtx destreg;
23027 rtx label = NULL;
23028 rtx tmp;
23029 rtx jump_around_label = NULL;
23030 HOST_WIDE_INT align = 1;
23031 unsigned HOST_WIDE_INT count = 0;
23032 HOST_WIDE_INT expected_size = -1;
23033 int size_needed = 0, epilogue_size_needed;
23034 int desired_align = 0, align_bytes = 0;
23035 enum stringop_alg alg;
23036 rtx promoted_val = NULL;
23037 bool force_loopy_epilogue = false;
23038 int dynamic_check;
23039 bool need_zero_guard = false;
23040 bool noalign;
23042 if (CONST_INT_P (align_exp))
23043 align = INTVAL (align_exp);
23044 /* i386 can do misaligned accesses at a reasonably increased cost. */
23045 if (CONST_INT_P (expected_align_exp)
23046 && INTVAL (expected_align_exp) > align)
23047 align = INTVAL (expected_align_exp);
23048 if (CONST_INT_P (count_exp))
23049 count = expected_size = INTVAL (count_exp);
23050 if (CONST_INT_P (expected_size_exp) && count == 0)
23051 expected_size = INTVAL (expected_size_exp);
23053 /* Make sure we don't need to care about overflow later on. */
23054 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23055 return false;
23057 /* Step 0: Decide on preferred algorithm, desired alignment and
23058 size of chunks to be copied by main loop. */
23060 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23061 desired_align = decide_alignment (align, alg, expected_size);
23063 if (!TARGET_ALIGN_STRINGOPS || noalign)
23064 align = desired_align;
23066 if (alg == libcall)
23067 return false;
23068 gcc_assert (alg != no_stringop);
23069 if (!count)
23070 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23071 destreg = copy_addr_to_reg (XEXP (dst, 0));
23072 switch (alg)
23074 case libcall:
23075 case no_stringop:
23076 gcc_unreachable ();
23077 case loop:
23078 need_zero_guard = true;
23079 size_needed = GET_MODE_SIZE (word_mode);
23080 break;
23081 case unrolled_loop:
23082 need_zero_guard = true;
23083 size_needed = GET_MODE_SIZE (word_mode) * 4;
23084 break;
23085 case rep_prefix_8_byte:
23086 size_needed = 8;
23087 break;
23088 case rep_prefix_4_byte:
23089 size_needed = 4;
23090 break;
23091 case rep_prefix_1_byte:
23092 size_needed = 1;
23093 break;
23094 case loop_1_byte:
23095 need_zero_guard = true;
23096 size_needed = 1;
23097 break;
23099 epilogue_size_needed = size_needed;
23101 /* Step 1: Prologue guard. */
23103 /* Alignment code needs count to be in register. */
23104 if (CONST_INT_P (count_exp) && desired_align > align)
23106 if (INTVAL (count_exp) > desired_align
23107 && INTVAL (count_exp) > size_needed)
23109 align_bytes
23110 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23111 if (align_bytes <= 0)
23112 align_bytes = 0;
23113 else
23114 align_bytes = desired_align - align_bytes;
23116 if (align_bytes == 0)
23118 enum machine_mode mode = SImode;
23119 if (TARGET_64BIT && (count & ~0xffffffff))
23120 mode = DImode;
23121 count_exp = force_reg (mode, count_exp);
23124 /* Do the cheap promotion to allow better CSE across the
23125 main loop and epilogue (i.e. one load of the big constant in
23126 front of all the code). */
23127 if (CONST_INT_P (val_exp))
23128 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23129 desired_align, align);
23130 /* Ensure that alignment prologue won't copy past end of block. */
23131 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23133 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23134 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23135 Make sure it is a power of 2. */
23136 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
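/* Worked example (hypothetical values): with size_needed == 8 and up to 7
   alignment bytes handled by the prologue, epilogue_size_needed starts as
   MAX (7, 7) == 7 and is rounded up to the power of two 8, so the epilogue
   copies COUNT_EXP & 7 bytes.  */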
23138 /* To improve performance for small blocks, we jump around the VAL
23139 promoting code. This means that if the promoted VAL is not constant,
23140 we might not use it in the epilogue and have to use the byte
23141 loop variant. */
23142 if (epilogue_size_needed > 2 && !promoted_val)
23143 force_loopy_epilogue = true;
23144 if (count)
23146 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23148 /* If main algorithm works on QImode, no epilogue is needed.
23149 For small sizes just don't align anything. */
23150 if (size_needed == 1)
23151 desired_align = align;
23152 else
23153 goto epilogue;
23156 else
23158 label = gen_label_rtx ();
23159 emit_cmp_and_jump_insns (count_exp,
23160 GEN_INT (epilogue_size_needed),
23161 LTU, 0, counter_mode (count_exp), 1, label);
23162 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23163 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23164 else
23165 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23168 if (dynamic_check != -1)
23170 rtx hot_label = gen_label_rtx ();
23171 jump_around_label = gen_label_rtx ();
23172 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23173 LEU, 0, counter_mode (count_exp), 1, hot_label);
23174 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23175 set_storage_via_libcall (dst, count_exp, val_exp, false);
23176 emit_jump (jump_around_label);
23177 emit_label (hot_label);
23180 /* Step 2: Alignment prologue. */
23182 /* Do the expensive promotion once we branched off the small blocks. */
23183 if (!promoted_val)
23184 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23185 desired_align, align);
23186 gcc_assert (desired_align >= 1 && align >= 1);
23188 if (desired_align > align)
23190 if (align_bytes == 0)
23192 /* Except for the first move in the epilogue, we no longer know
23193 the constant offset in the aliasing info.  It doesn't seem worth
23194 the pain to maintain it for the first move, so throw away
23195 the info early. */
23196 dst = change_address (dst, BLKmode, destreg);
23197 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23198 desired_align);
23200 else
23202 /* If we know how many bytes need to be stored before dst is
23203 sufficiently aligned, maintain aliasing info accurately. */
23204 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23205 desired_align, align_bytes);
23206 count_exp = plus_constant (counter_mode (count_exp),
23207 count_exp, -align_bytes);
23208 count -= align_bytes;
23210 if (need_zero_guard
23211 && (count < (unsigned HOST_WIDE_INT) size_needed
23212 || (align_bytes == 0
23213 && count < ((unsigned HOST_WIDE_INT) size_needed
23214 + desired_align - align))))
23216 /* It is possible that we copied enough so that the main loop will not
23217 execute. */
23218 gcc_assert (size_needed > 1);
23219 if (label == NULL_RTX)
23220 label = gen_label_rtx ();
23221 emit_cmp_and_jump_insns (count_exp,
23222 GEN_INT (size_needed),
23223 LTU, 0, counter_mode (count_exp), 1, label);
23224 if (expected_size == -1
23225 || expected_size < (desired_align - align) / 2 + size_needed)
23226 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23227 else
23228 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23231 if (label && size_needed == 1)
23233 emit_label (label);
23234 LABEL_NUSES (label) = 1;
23235 label = NULL;
23236 promoted_val = val_exp;
23237 epilogue_size_needed = 1;
23239 else if (label == NULL_RTX)
23240 epilogue_size_needed = size_needed;
23242 /* Step 3: Main loop. */
23244 switch (alg)
23246 case libcall:
23247 case no_stringop:
23248 gcc_unreachable ();
23249 case loop_1_byte:
23250 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23251 count_exp, QImode, 1, expected_size);
23252 break;
23253 case loop:
23254 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23255 count_exp, word_mode, 1, expected_size);
23256 break;
23257 case unrolled_loop:
23258 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23259 count_exp, word_mode, 4, expected_size);
23260 break;
23261 case rep_prefix_8_byte:
23262 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23263 DImode, val_exp);
23264 break;
23265 case rep_prefix_4_byte:
23266 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23267 SImode, val_exp);
23268 break;
23269 case rep_prefix_1_byte:
23270 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23271 QImode, val_exp);
23272 break;
23274 /* Properly adjust the offset of the dest memory for aliasing. */
23275 if (CONST_INT_P (count_exp))
23276 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23277 (count / size_needed) * size_needed);
23278 else
23279 dst = change_address (dst, BLKmode, destreg);
23281 /* Step 4: Epilogue to copy the remaining bytes. */
23283 if (label)
23285 /* When the main loop is done, COUNT_EXP might hold the original count,
23286 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23287 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23288 bytes. Compensate if needed. */
23290 if (size_needed < epilogue_size_needed)
23292 tmp =
23293 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23294 GEN_INT (size_needed - 1), count_exp, 1,
23295 OPTAB_DIRECT);
23296 if (tmp != count_exp)
23297 emit_move_insn (count_exp, tmp);
23299 emit_label (label);
23300 LABEL_NUSES (label) = 1;
23302 epilogue:
23303 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23305 if (force_loopy_epilogue)
23306 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23307 epilogue_size_needed);
23308 else
23309 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23310 epilogue_size_needed);
23312 if (jump_around_label)
23313 emit_label (jump_around_label);
23314 return true;
23317 /* Expand the appropriate insns for doing strlen if not just doing
23318 repnz; scasb
23320 out = result, initialized with the start address
23321 align_rtx = alignment of the address.
23322 scratch = scratch register, initialized with the start address when
23323 not aligned, otherwise undefined
23325 This is just the body. It needs the initializations mentioned above and
23326 some address computing at the end. These things are done in i386.md. */
23328 static void
23329 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23331 int align;
23332 rtx tmp;
23333 rtx align_2_label = NULL_RTX;
23334 rtx align_3_label = NULL_RTX;
23335 rtx align_4_label = gen_label_rtx ();
23336 rtx end_0_label = gen_label_rtx ();
23337 rtx mem;
23338 rtx tmpreg = gen_reg_rtx (SImode);
23339 rtx scratch = gen_reg_rtx (SImode);
23340 rtx cmp;
23342 align = 0;
23343 if (CONST_INT_P (align_rtx))
23344 align = INTVAL (align_rtx);
23346 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23348 /* Is there a known alignment and is it less than 4? */
23349 if (align < 4)
23351 rtx scratch1 = gen_reg_rtx (Pmode);
23352 emit_move_insn (scratch1, out);
23353 /* Is there a known alignment and is it not 2? */
23354 if (align != 2)
23356 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23357 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23359 /* Leave just the 3 lower bits. */
23360 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23361 NULL_RTX, 0, OPTAB_WIDEN);
23363 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23364 Pmode, 1, align_4_label);
23365 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23366 Pmode, 1, align_2_label);
23367 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23368 Pmode, 1, align_3_label);
23370 else
23372 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23373 check if it is aligned to 4 bytes. */
23375 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23376 NULL_RTX, 0, OPTAB_WIDEN);
23378 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23379 Pmode, 1, align_4_label);
23382 mem = change_address (src, QImode, out);
23384 /* Now compare the bytes. */
23386 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23387 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23388 QImode, 1, end_0_label);
23390 /* Increment the address. */
23391 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23393 /* Not needed with an alignment of 2 */
23394 if (align != 2)
23396 emit_label (align_2_label);
23398 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23399 end_0_label);
23401 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23403 emit_label (align_3_label);
23406 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23407 end_0_label);
23409 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23412 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23413 align this loop; it only makes the program larger and does not help
23414 to speed it up. */
23415 emit_label (align_4_label);
23417 mem = change_address (src, SImode, out);
23418 emit_move_insn (scratch, mem);
23419 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23421 /* This formula yields a nonzero result iff one of the bytes is zero.
23422 This saves three branches inside the loop and many cycles. */
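/* Worked example of the formula computed below,
   ((x - 0x01010101) & ~x & 0x80808080):
     x == 0x61006263 (contains a zero byte):  result == 0x00800000 != 0
     x == 0x61626364 (no zero byte):          result == 0
   Values chosen only for illustration.  */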
23424 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23425 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23426 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23427 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23428 gen_int_mode (0x80808080, SImode)));
23429 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23430 align_4_label);
23432 if (TARGET_CMOVE)
23434 rtx reg = gen_reg_rtx (SImode);
23435 rtx reg2 = gen_reg_rtx (Pmode);
23436 emit_move_insn (reg, tmpreg);
23437 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23439 /* If zero is not in the first two bytes, move two bytes forward. */
23440 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23441 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23442 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23443 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23444 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23445 reg,
23446 tmpreg)));
23447 /* Emit lea manually to avoid clobbering of flags. */
23448 emit_insn (gen_rtx_SET (SImode, reg2,
23449 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23451 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23452 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23453 emit_insn (gen_rtx_SET (VOIDmode, out,
23454 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23455 reg2,
23456 out)));
23458 else
23460 rtx end_2_label = gen_label_rtx ();
23461 /* Is zero in the first two bytes? */
23463 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23464 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23465 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23466 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23467 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23468 pc_rtx);
23469 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23470 JUMP_LABEL (tmp) = end_2_label;
23472 /* Not in the first two. Move two bytes forward. */
23473 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23474 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23476 emit_label (end_2_label);
23480 /* Avoid branch in fixing the byte. */
23481 tmpreg = gen_lowpart (QImode, tmpreg);
23482 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23483 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23484 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23485 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23487 emit_label (end_0_label);
23490 /* Expand strlen. */
23492 bool
23493 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23495 rtx addr, scratch1, scratch2, scratch3, scratch4;
23497 /* The generic case of the strlen expander is long.  Avoid
23498 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
23500 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23501 && !TARGET_INLINE_ALL_STRINGOPS
23502 && !optimize_insn_for_size_p ()
23503 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23504 return false;
23506 addr = force_reg (Pmode, XEXP (src, 0));
23507 scratch1 = gen_reg_rtx (Pmode);
23509 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23510 && !optimize_insn_for_size_p ())
23512 /* Well it seems that some optimizer does not combine a call like
23513 foo(strlen(bar), strlen(bar));
23514 when the move and the subtraction are done here. It does calculate
23515 the length just once when these instructions are done inside of
23516 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23517 often used and I use one fewer register for the lifetime of
23518 output_strlen_unroll() this is better. */
23520 emit_move_insn (out, addr);
23522 ix86_expand_strlensi_unroll_1 (out, src, align);
23524 /* strlensi_unroll_1 returns the address of the zero at the end of
23525 the string, like memchr(), so compute the length by subtracting
23526 the start address. */
23527 emit_insn (ix86_gen_sub3 (out, out, addr));
23529 else
23531 rtx unspec;
23533 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23534 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23535 return false;
23537 scratch2 = gen_reg_rtx (Pmode);
23538 scratch3 = gen_reg_rtx (Pmode);
23539 scratch4 = force_reg (Pmode, constm1_rtx);
23541 emit_move_insn (scratch3, addr);
23542 eoschar = force_reg (QImode, eoschar);
23544 src = replace_equiv_address_nv (src, scratch3);
23546 /* If .md starts supporting :P, this can be done in .md. */
23547 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23548 scratch4), UNSPEC_SCAS);
23549 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23550 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23551 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23553 return true;
23556 /* For a given symbol (function), construct code to compute the address of its
23557 PLT entry in the large x86-64 PIC model. */
23558 static rtx
23559 construct_plt_address (rtx symbol)
23561 rtx tmp, unspec;
23563 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23564 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23565 gcc_assert (Pmode == DImode);
23567 tmp = gen_reg_rtx (Pmode);
23568 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23570 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23571 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23572 return tmp;
23576 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23577 rtx callarg2,
23578 rtx pop, bool sibcall)
23580 /* We need to represent that SI and DI registers are clobbered
23581 by SYSV calls. */
23582 static int clobbered_registers[] = {
23583 XMM6_REG, XMM7_REG, XMM8_REG,
23584 XMM9_REG, XMM10_REG, XMM11_REG,
23585 XMM12_REG, XMM13_REG, XMM14_REG,
23586 XMM15_REG, SI_REG, DI_REG
23588 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23589 rtx use = NULL, call;
23590 unsigned int vec_len;
23592 if (pop == const0_rtx)
23593 pop = NULL;
23594 gcc_assert (!TARGET_64BIT || !pop);
23596 if (TARGET_MACHO && !TARGET_64BIT)
23598 #if TARGET_MACHO
23599 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23600 fnaddr = machopic_indirect_call_target (fnaddr);
23601 #endif
23603 else
23605 /* Static functions and indirect calls don't need the pic register. */
23606 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23607 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23608 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23609 use_reg (&use, pic_offset_table_rtx);
23612 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23614 rtx al = gen_rtx_REG (QImode, AX_REG);
23615 emit_move_insn (al, callarg2);
23616 use_reg (&use, al);
23619 if (ix86_cmodel == CM_LARGE_PIC
23620 && MEM_P (fnaddr)
23621 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23622 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23623 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23624 else if (sibcall
23625 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23626 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23628 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23629 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23632 vec_len = 0;
23633 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23634 if (retval)
23635 call = gen_rtx_SET (VOIDmode, retval, call);
23636 vec[vec_len++] = call;
23638 if (pop)
23640 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23641 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23642 vec[vec_len++] = pop;
23645 if (TARGET_64BIT_MS_ABI
23646 && (!callarg2 || INTVAL (callarg2) != -2))
23648 unsigned i;
23650 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23651 UNSPEC_MS_TO_SYSV_CALL);
23653 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23654 vec[vec_len++]
23655 = gen_rtx_CLOBBER (VOIDmode,
23656 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23657 ? TImode : DImode,
23658 clobbered_registers[i]));
23661 if (vec_len > 1)
23662 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23663 call = emit_call_insn (call);
23664 if (use)
23665 CALL_INSN_FUNCTION_USAGE (call) = use;
23667 return call;
23670 /* Output the assembly for a call instruction. */
23672 const char *
23673 ix86_output_call_insn (rtx insn, rtx call_op)
23675 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23676 bool seh_nop_p = false;
23677 const char *xasm;
23679 if (SIBLING_CALL_P (insn))
23681 if (direct_p)
23682 xasm = "jmp\t%P0";
23683 /* SEH epilogue detection requires the indirect branch case
23684 to include REX.W. */
23685 else if (TARGET_SEH)
23686 xasm = "rex.W jmp %A0";
23687 else
23688 xasm = "jmp\t%A0";
23690 output_asm_insn (xasm, &call_op);
23691 return "";
23694 /* SEH unwinding can require an extra nop to be emitted in several
23695 circumstances. Determine if we have one of those. */
23696 if (TARGET_SEH)
23698 rtx i;
23700 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23702 /* If we get to another real insn, we don't need the nop. */
23703 if (INSN_P (i))
23704 break;
23706 /* If we get to the epilogue note, prevent a catch region from
23707 being adjacent to the standard epilogue sequence.  With non-call
23708 exceptions, we'll have done this during epilogue emission. */
23709 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23710 && !flag_non_call_exceptions
23711 && !can_throw_internal (insn))
23713 seh_nop_p = true;
23714 break;
23718 /* If we didn't find a real insn following the call, prevent the
23719 unwinder from looking into the next function. */
23720 if (i == NULL)
23721 seh_nop_p = true;
23724 if (direct_p)
23725 xasm = "call\t%P0";
23726 else
23727 xasm = "call\t%A0";
23729 output_asm_insn (xasm, &call_op);
23731 if (seh_nop_p)
23732 return "nop";
23734 return "";
23737 /* Clear stack slot assignments remembered from previous functions.
23738 This is called from INIT_EXPANDERS once before RTL is emitted for each
23739 function. */
23741 static struct machine_function *
23742 ix86_init_machine_status (void)
23744 struct machine_function *f;
23746 f = ggc_alloc_cleared_machine_function ();
23747 f->use_fast_prologue_epilogue_nregs = -1;
23748 f->call_abi = ix86_abi;
23750 return f;
23753 /* Return a MEM corresponding to a stack slot with mode MODE.
23754 Allocate a new slot if necessary.
23756 The RTL for a function can have several slots available: N is
23757 which slot to use. */
23760 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23762 struct stack_local_entry *s;
23764 gcc_assert (n < MAX_386_STACK_LOCALS);
23766 for (s = ix86_stack_locals; s; s = s->next)
23767 if (s->mode == mode && s->n == n)
23768 return validize_mem (copy_rtx (s->rtl));
23770 s = ggc_alloc_stack_local_entry ();
23771 s->n = n;
23772 s->mode = mode;
23773 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23775 s->next = ix86_stack_locals;
23776 ix86_stack_locals = s;
23777 return validize_mem (s->rtl);
23780 static void
23781 ix86_instantiate_decls (void)
23783 struct stack_local_entry *s;
23785 for (s = ix86_stack_locals; s; s = s->next)
23786 if (s->rtl != NULL_RTX)
23787 instantiate_decl_rtl (s->rtl);
23790 /* Calculate the length of the memory address in the instruction encoding.
23791 Includes the addr32 prefix; does not include the one-byte modrm, opcode,
23792 or other prefixes. We never generate an addr32 prefix for the LEA insn. */
23795 memory_address_length (rtx addr, bool lea)
23797 struct ix86_address parts;
23798 rtx base, index, disp;
23799 int len;
23800 int ok;
23802 if (GET_CODE (addr) == PRE_DEC
23803 || GET_CODE (addr) == POST_INC
23804 || GET_CODE (addr) == PRE_MODIFY
23805 || GET_CODE (addr) == POST_MODIFY)
23806 return 0;
23808 ok = ix86_decompose_address (addr, &parts);
23809 gcc_assert (ok);
23811 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23813 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
23814 if (TARGET_64BIT && !lea
23815 && (SImode_address_operand (addr, VOIDmode)
23816 || (parts.base && GET_MODE (parts.base) == SImode)
23817 || (parts.index && GET_MODE (parts.index) == SImode)))
23818 len++;
23820 base = parts.base;
23821 index = parts.index;
23822 disp = parts.disp;
23824 if (base && GET_CODE (base) == SUBREG)
23825 base = SUBREG_REG (base);
23826 if (index && GET_CODE (index) == SUBREG)
23827 index = SUBREG_REG (index);
23829 gcc_assert (base == NULL_RTX || REG_P (base));
23830 gcc_assert (index == NULL_RTX || REG_P (index));
23832 /* Rule of thumb:
23833 - esp as the base always wants an index,
23834 - ebp as the base always wants a displacement,
23835 - r12 as the base always wants an index,
23836 - r13 as the base always wants a displacement. */
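/* For illustration (encodings, not code we generate here): "mov (%ecx), %eax"
   needs only the one-byte modrm, "mov (%esp), %eax" needs an extra SIB byte,
   and "mov (%ebp), %eax" must be encoded as 0(%ebp) with a disp8 byte; r12
   and r13 behave like esp and ebp respectively in 64-bit code.  */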
23838 /* Register Indirect. */
23839 if (base && !index && !disp)
23841 /* esp (for its index) and ebp (for its displacement) need
23842 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23843 code. */
23844 if (base == arg_pointer_rtx
23845 || base == frame_pointer_rtx
23846 || REGNO (base) == SP_REG
23847 || REGNO (base) == BP_REG
23848 || REGNO (base) == R12_REG
23849 || REGNO (base) == R13_REG)
23850 len++;
23853 /* Direct Addressing. In 64-bit mode, mod 00 r/m 5
23854 is not disp32 but disp32(%rip), so for disp32 a
23855 SIB byte is needed, unless print_operand_address
23856 optimizes it into disp32(%rip) or (%rip) is implied
23857 by an UNSPEC. */
23858 else if (disp && !base && !index)
23860 len += 4;
23861 if (TARGET_64BIT)
23863 rtx symbol = disp;
23865 if (GET_CODE (disp) == CONST)
23866 symbol = XEXP (disp, 0);
23867 if (GET_CODE (symbol) == PLUS
23868 && CONST_INT_P (XEXP (symbol, 1)))
23869 symbol = XEXP (symbol, 0);
23871 if (GET_CODE (symbol) != LABEL_REF
23872 && (GET_CODE (symbol) != SYMBOL_REF
23873 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23874 && (GET_CODE (symbol) != UNSPEC
23875 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23876 && XINT (symbol, 1) != UNSPEC_PCREL
23877 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23878 len++;
23881 else
23883 /* Find the length of the displacement constant. */
23884 if (disp)
23886 if (base && satisfies_constraint_K (disp))
23887 len += 1;
23888 else
23889 len += 4;
23891 /* ebp always wants a displacement. Similarly r13. */
23892 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23893 len++;
23895 /* An index requires the two-byte modrm form.... */
23896 if (index
23897 /* ...like esp (or r12), which always wants an index. */
23898 || base == arg_pointer_rtx
23899 || base == frame_pointer_rtx
23900 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23901 len++;
23904 return len;
23907 /* Compute the default value for the "length_immediate" attribute. When SHORTFORM
23908 is set, expect that the insn has an 8-bit immediate alternative. */
23910 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23912 int len = 0;
23913 int i;
23914 extract_insn_cached (insn);
23915 for (i = recog_data.n_operands - 1; i >= 0; --i)
23916 if (CONSTANT_P (recog_data.operand[i]))
23918 enum attr_mode mode = get_attr_mode (insn);
23920 gcc_assert (!len);
23921 if (shortform && CONST_INT_P (recog_data.operand[i]))
23923 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23924 switch (mode)
23926 case MODE_QI:
23927 len = 1;
23928 continue;
23929 case MODE_HI:
23930 ival = trunc_int_for_mode (ival, HImode);
23931 break;
23932 case MODE_SI:
23933 ival = trunc_int_for_mode (ival, SImode);
23934 break;
23935 default:
23936 break;
23938 if (IN_RANGE (ival, -128, 127))
23940 len = 1;
23941 continue;
23944 switch (mode)
23946 case MODE_QI:
23947 len = 1;
23948 break;
23949 case MODE_HI:
23950 len = 2;
23951 break;
23952 case MODE_SI:
23953 len = 4;
23954 break;
23955 /* Immediates for DImode instructions are encoded
23956 as 32bit sign extended values. */
23957 case MODE_DI:
23958 len = 4;
23959 break;
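/* For example, "addq $0x12345678, %rax" still carries a 4-byte immediate that
   the CPU sign-extends to 64 bits; only the movabs form of MOV takes a full
   8-byte immediate.  (Illustrative note, not part of the original comment.)  */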
23960 default:
23961 fatal_insn ("unknown insn mode", insn);
23964 return len;
23967 /* Compute default value for "length_address" attribute. */
23969 ix86_attr_length_address_default (rtx insn)
23971 int i;
23973 if (get_attr_type (insn) == TYPE_LEA)
23975 rtx set = PATTERN (insn), addr;
23977 if (GET_CODE (set) == PARALLEL)
23978 set = XVECEXP (set, 0, 0);
23980 gcc_assert (GET_CODE (set) == SET);
23982 addr = SET_SRC (set);
23984 return memory_address_length (addr, true);
23987 extract_insn_cached (insn);
23988 for (i = recog_data.n_operands - 1; i >= 0; --i)
23989 if (MEM_P (recog_data.operand[i]))
23991 constrain_operands_cached (reload_completed);
23992 if (which_alternative != -1)
23994 const char *constraints = recog_data.constraints[i];
23995 int alt = which_alternative;
23997 while (*constraints == '=' || *constraints == '+')
23998 constraints++;
23999 while (alt-- > 0)
24000 while (*constraints++ != ',')
24002 /* Skip ignored operands. */
24003 if (*constraints == 'X')
24004 continue;
24006 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24008 return 0;
24011 /* Compute the default value for the "length_vex" attribute. It includes
24012 the 2- or 3-byte VEX prefix and 1 opcode byte. */
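/* Rough examples of the resulting lengths (illustration only):
   "vaddps %xmm1, %xmm2, %xmm3" can use the 2-byte VEX prefix, so 2 + 1 == 3;
   an insn that needs VEX.W, an opcode map other than 0f, or the REX.X/REX.B
   bits must use the 3-byte prefix, so 3 + 1 == 4.  */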
24015 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24017 int i;
24019 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX W bit
24020 requires the 3-byte VEX prefix. */
24021 if (!has_0f_opcode || has_vex_w)
24022 return 3 + 1;
24024 /* We can always use 2 byte VEX prefix in 32bit. */
24025 if (!TARGET_64BIT)
24026 return 2 + 1;
24028 extract_insn_cached (insn);
24030 for (i = recog_data.n_operands - 1; i >= 0; --i)
24031 if (REG_P (recog_data.operand[i]))
24033 /* REX.W bit uses 3 byte VEX prefix. */
24034 if (GET_MODE (recog_data.operand[i]) == DImode
24035 && GENERAL_REG_P (recog_data.operand[i]))
24036 return 3 + 1;
24038 else
24040 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24041 if (MEM_P (recog_data.operand[i])
24042 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24043 return 3 + 1;
24046 return 2 + 1;
24049 /* Return the maximum number of instructions a cpu can issue. */
24051 static int
24052 ix86_issue_rate (void)
24054 switch (ix86_tune)
24056 case PROCESSOR_PENTIUM:
24057 case PROCESSOR_ATOM:
24058 case PROCESSOR_K6:
24059 case PROCESSOR_BTVER2:
24060 return 2;
24062 case PROCESSOR_PENTIUMPRO:
24063 case PROCESSOR_PENTIUM4:
24064 case PROCESSOR_CORE2:
24065 case PROCESSOR_COREI7:
24066 case PROCESSOR_ATHLON:
24067 case PROCESSOR_K8:
24068 case PROCESSOR_AMDFAM10:
24069 case PROCESSOR_NOCONA:
24070 case PROCESSOR_GENERIC32:
24071 case PROCESSOR_GENERIC64:
24072 case PROCESSOR_BDVER1:
24073 case PROCESSOR_BDVER2:
24074 case PROCESSOR_BDVER3:
24075 case PROCESSOR_BTVER1:
24076 return 3;
24078 default:
24079 return 1;
24083 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags set
24084 by DEP_INSN and nothing else set by DEP_INSN. */
24086 static bool
24087 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24089 rtx set, set2;
24091 /* Simplify the test for uninteresting insns. */
24092 if (insn_type != TYPE_SETCC
24093 && insn_type != TYPE_ICMOV
24094 && insn_type != TYPE_FCMOV
24095 && insn_type != TYPE_IBR)
24096 return false;
24098 if ((set = single_set (dep_insn)) != 0)
24100 set = SET_DEST (set);
24101 set2 = NULL_RTX;
24103 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24104 && XVECLEN (PATTERN (dep_insn), 0) == 2
24105 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24106 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24108 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24109 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24111 else
24112 return false;
24114 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24115 return false;
24117 /* This test is true if the dependent insn reads the flags but
24118 not any other potentially set register. */
24119 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24120 return false;
24122 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24123 return false;
24125 return true;
24128 /* Return true iff USE_INSN has a memory address with operands set by
24129 SET_INSN. */
24131 bool
24132 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24134 int i;
24135 extract_insn_cached (use_insn);
24136 for (i = recog_data.n_operands - 1; i >= 0; --i)
24137 if (MEM_P (recog_data.operand[i]))
24139 rtx addr = XEXP (recog_data.operand[i], 0);
24140 return modified_in_p (addr, set_insn) != 0;
24142 return false;
24145 static int
24146 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24148 enum attr_type insn_type, dep_insn_type;
24149 enum attr_memory memory;
24150 rtx set, set2;
24151 int dep_insn_code_number;
24153 /* Anti and output dependencies have zero cost on all CPUs. */
24154 if (REG_NOTE_KIND (link) != 0)
24155 return 0;
24157 dep_insn_code_number = recog_memoized (dep_insn);
24159 /* If we can't recognize the insns, we can't really do anything. */
24160 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24161 return cost;
24163 insn_type = get_attr_type (insn);
24164 dep_insn_type = get_attr_type (dep_insn);
24166 switch (ix86_tune)
24168 case PROCESSOR_PENTIUM:
24169 /* Address Generation Interlock adds a cycle of latency. */
24170 if (insn_type == TYPE_LEA)
24172 rtx addr = PATTERN (insn);
24174 if (GET_CODE (addr) == PARALLEL)
24175 addr = XVECEXP (addr, 0, 0);
24177 gcc_assert (GET_CODE (addr) == SET);
24179 addr = SET_SRC (addr);
24180 if (modified_in_p (addr, dep_insn))
24181 cost += 1;
24183 else if (ix86_agi_dependent (dep_insn, insn))
24184 cost += 1;
24186 /* ??? Compares pair with jump/setcc. */
24187 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24188 cost = 0;
24190 /* Floating point stores require value to be ready one cycle earlier. */
24191 if (insn_type == TYPE_FMOV
24192 && get_attr_memory (insn) == MEMORY_STORE
24193 && !ix86_agi_dependent (dep_insn, insn))
24194 cost += 1;
24195 break;
24197 case PROCESSOR_PENTIUMPRO:
24198 memory = get_attr_memory (insn);
24200 /* INT->FP conversion is expensive. */
24201 if (get_attr_fp_int_src (dep_insn))
24202 cost += 5;
24204 /* There is one cycle extra latency between an FP op and a store. */
24205 if (insn_type == TYPE_FMOV
24206 && (set = single_set (dep_insn)) != NULL_RTX
24207 && (set2 = single_set (insn)) != NULL_RTX
24208 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24209 && MEM_P (SET_DEST (set2)))
24210 cost += 1;
24212 /* Show the ability of the reorder buffer to hide the latency of a load by
24213 executing it in parallel with the previous instruction when the previous
24214 instruction is not needed to compute the address. */
24215 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24216 && !ix86_agi_dependent (dep_insn, insn))
24218 /* Claim moves to take one cycle, as the core can issue one load
24219 at a time and the next load can start a cycle later. */
24220 if (dep_insn_type == TYPE_IMOV
24221 || dep_insn_type == TYPE_FMOV)
24222 cost = 1;
24223 else if (cost > 1)
24224 cost--;
24226 break;
24228 case PROCESSOR_K6:
24229 memory = get_attr_memory (insn);
24231 /* The esp dependency is resolved before the instruction is really
24232 finished. */
24233 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24234 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24235 return 1;
24237 /* INT->FP conversion is expensive. */
24238 if (get_attr_fp_int_src (dep_insn))
24239 cost += 5;
24241 /* Show the ability of the reorder buffer to hide the latency of a load by
24242 executing it in parallel with the previous instruction when the previous
24243 instruction is not needed to compute the address. */
24244 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24245 && !ix86_agi_dependent (dep_insn, insn))
24247 /* Claim moves to take one cycle, as the core can issue one load
24248 at a time and the next load can start a cycle later. */
24249 if (dep_insn_type == TYPE_IMOV
24250 || dep_insn_type == TYPE_FMOV)
24251 cost = 1;
24252 else if (cost > 2)
24253 cost -= 2;
24254 else
24255 cost = 1;
24257 break;
24259 case PROCESSOR_ATHLON:
24260 case PROCESSOR_K8:
24261 case PROCESSOR_AMDFAM10:
24262 case PROCESSOR_BDVER1:
24263 case PROCESSOR_BDVER2:
24264 case PROCESSOR_BDVER3:
24265 case PROCESSOR_BTVER1:
24266 case PROCESSOR_BTVER2:
24267 case PROCESSOR_ATOM:
24268 case PROCESSOR_GENERIC32:
24269 case PROCESSOR_GENERIC64:
24270 memory = get_attr_memory (insn);
24272 /* Show the ability of the reorder buffer to hide the latency of a load by
24273 executing it in parallel with the previous instruction when the previous
24274 instruction is not needed to compute the address. */
24275 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24276 && !ix86_agi_dependent (dep_insn, insn))
24278 enum attr_unit unit = get_attr_unit (insn);
24279 int loadcost = 3;
24281 /* Because of the difference between the length of the integer and
24282 floating point unit pipeline preparation stages, the memory operands
24283 for floating point are cheaper.
24285 ??? For Athlon the difference is most probably 2. */
24286 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24287 loadcost = 3;
24288 else
24289 loadcost = TARGET_ATHLON ? 2 : 0;
24291 if (cost >= loadcost)
24292 cost -= loadcost;
24293 else
24294 cost = 0;
24297 default:
24298 break;
24301 return cost;
24304 /* How many alternative schedules to try. This should be as wide as the
24305 scheduling freedom in the DFA, but no wider. Making this value too
24306 large results in extra work for the scheduler. */
24308 static int
24309 ia32_multipass_dfa_lookahead (void)
24311 switch (ix86_tune)
24313 case PROCESSOR_PENTIUM:
24314 return 2;
24316 case PROCESSOR_PENTIUMPRO:
24317 case PROCESSOR_K6:
24318 return 1;
24320 case PROCESSOR_CORE2:
24321 case PROCESSOR_COREI7:
24322 case PROCESSOR_ATOM:
24323 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24324 as the number of instructions that can be executed in one cycle, i.e.,
24325 issue_rate. I wonder why tuning for many CPUs does not do this. */
24326 if (reload_completed)
24327 return ix86_issue_rate ();
24328 /* Don't use lookahead for pre-reload schedule to save compile time. */
24329 return 0;
24331 default:
24332 return 0;
24336 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
24337 execution. It is applied if
24338 (1) an IMUL instruction is at the top of the list;
24339 (2) there is exactly one producer of an independent IMUL instruction in the
24340 ready list;
24341 in that case, (3) the found producer is put at the top of the ready list.
24342 Returns the issue rate. */
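/* In other words (a paraphrase, no additional behaviour): if an SImode IMUL is
   at the top of the ready list and some other ready insn is the sole producer
   feeding another IMUL (one independent of the IMUL already at the top), that
   producer is moved to the top so the pipelined IMUL unit can be kept busy.  */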
24344 static int
24345 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24346 int clock_var ATTRIBUTE_UNUSED)
24348 static int issue_rate = -1;
24349 int n_ready = *pn_ready;
24350 rtx insn, insn1, insn2;
24351 int i;
24352 sd_iterator_def sd_it;
24353 dep_t dep;
24354 int index = -1;
24356 /* Set up issue rate. */
24357 issue_rate = ix86_issue_rate();
24359 /* Do reordering for Atom only. */
24360 if (ix86_tune != PROCESSOR_ATOM)
24361 return issue_rate;
24362 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24363 if (!reload_completed)
24364 return issue_rate;
24365 /* Nothing to do if ready list contains only 1 instruction. */
24366 if (n_ready <= 1)
24367 return issue_rate;
24369 /* Check that IMUL instruction is on the top of ready list. */
24370 insn = ready[n_ready - 1];
24371 if (!NONDEBUG_INSN_P (insn))
24372 return issue_rate;
24373 insn = PATTERN (insn);
24374 if (GET_CODE (insn) == PARALLEL)
24375 insn = XVECEXP (insn, 0, 0);
24376 if (GET_CODE (insn) != SET)
24377 return issue_rate;
24378 if (!(GET_CODE (SET_SRC (insn)) == MULT
24379 && GET_MODE (SET_SRC (insn)) == SImode))
24380 return issue_rate;
24382 /* Search for producer of independent IMUL instruction. */
24383 for (i = n_ready - 2; i>= 0; i--)
24385 insn = ready[i];
24386 if (!NONDEBUG_INSN_P (insn))
24387 continue;
24388 /* Skip IMUL instruction. */
24389 insn2 = PATTERN (insn);
24390 if (GET_CODE (insn2) == PARALLEL)
24391 insn2 = XVECEXP (insn2, 0, 0);
24392 if (GET_CODE (insn2) == SET
24393 && GET_CODE (SET_SRC (insn2)) == MULT
24394 && GET_MODE (SET_SRC (insn2)) == SImode)
24395 continue;
24397 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24399 rtx con;
24400 con = DEP_CON (dep);
24401 if (!NONDEBUG_INSN_P (con))
24402 continue;
24403 insn1 = PATTERN (con);
24404 if (GET_CODE (insn1) == PARALLEL)
24405 insn1 = XVECEXP (insn1, 0, 0);
24407 if (GET_CODE (insn1) == SET
24408 && GET_CODE (SET_SRC (insn1)) == MULT
24409 && GET_MODE (SET_SRC (insn1)) == SImode)
24411 sd_iterator_def sd_it1;
24412 dep_t dep1;
24413 /* Check that the IMUL has no other producer. */
24414 index = i;
24415 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24417 rtx pro;
24418 pro = DEP_PRO (dep1);
24419 if (!NONDEBUG_INSN_P (pro))
24420 continue;
24421 if (pro != insn)
24422 index = -1;
24424 if (index >= 0)
24425 break;
24428 if (index >= 0)
24429 break;
24431 if (index < 0)
24432 return issue_rate; /* Didn't find IMUL producer. */
24434 if (sched_verbose > 1)
24435 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24436 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24438 /* Put IMUL producer (ready[index]) at the top of ready list. */
24439 insn1= ready[index];
24440 for (i = index; i < n_ready - 1; i++)
24441 ready[i] = ready[i + 1];
24442 ready[n_ready - 1] = insn1;
24444 return issue_rate;
24447 static bool
24448 ix86_class_likely_spilled_p (reg_class_t);
24450 /* Returns true if the lhs of INSN is a HW function argument register and sets
24451 is_spilled to true if it is a likely-spilled HW register. */
24452 static bool
24453 insn_is_function_arg (rtx insn, bool* is_spilled)
24455 rtx dst;
24457 if (!NONDEBUG_INSN_P (insn))
24458 return false;
24459 /* Call instructions are not movable; ignore them. */
24460 if (CALL_P (insn))
24461 return false;
24462 insn = PATTERN (insn);
24463 if (GET_CODE (insn) == PARALLEL)
24464 insn = XVECEXP (insn, 0, 0);
24465 if (GET_CODE (insn) != SET)
24466 return false;
24467 dst = SET_DEST (insn);
24468 if (REG_P (dst) && HARD_REGISTER_P (dst)
24469 && ix86_function_arg_regno_p (REGNO (dst)))
24471 /* Is it likely spilled HW register? */
24472 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24473 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24474 *is_spilled = true;
24475 return true;
24477 return false;
24480 /* Add output dependencies for a chain of adjacent function arguments, but only
24481 if there is a move to a likely-spilled HW register. Return the first argument
24482 if at least one dependence was added, or NULL otherwise. */
24483 static rtx
24484 add_parameter_dependencies (rtx call, rtx head)
24486 rtx insn;
24487 rtx last = call;
24488 rtx first_arg = NULL;
24489 bool is_spilled = false;
24491 head = PREV_INSN (head);
24493 /* Find the argument-passing instruction nearest to the call. */
24494 while (true)
24496 last = PREV_INSN (last);
24497 if (last == head)
24498 return NULL;
24499 if (!NONDEBUG_INSN_P (last))
24500 continue;
24501 if (insn_is_function_arg (last, &is_spilled))
24502 break;
24503 return NULL;
24506 first_arg = last;
24507 while (true)
24509 insn = PREV_INSN (last);
24510 if (!INSN_P (insn))
24511 break;
24512 if (insn == head)
24513 break;
24514 if (!NONDEBUG_INSN_P (insn))
24516 last = insn;
24517 continue;
24519 if (insn_is_function_arg (insn, &is_spilled))
24521 /* Add an output dependence between two function arguments if the chain
24522 of output arguments contains likely-spilled HW registers. */
24523 if (is_spilled)
24524 add_dependence (last, insn, REG_DEP_OUTPUT);
24525 first_arg = last = insn;
24527 else
24528 break;
24530 if (!is_spilled)
24531 return NULL;
24532 return first_arg;
24535 /* Add output or anti dependency from insn to first_arg to restrict its code
24536 motion. */
24537 static void
24538 avoid_func_arg_motion (rtx first_arg, rtx insn)
24540 rtx set;
24541 rtx tmp;
24543 set = single_set (insn);
24544 if (!set)
24545 return;
24546 tmp = SET_DEST (set);
24547 if (REG_P (tmp))
24549 /* Add output dependency to the first function argument. */
24550 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24551 return;
24553 /* Add anti dependency. */
24554 add_dependence (first_arg, insn, REG_DEP_ANTI);
24557 /* Avoid cross-block motion of a function argument by adding a dependency
24558 from the first non-jump instruction in BB. */
24559 static void
24560 add_dependee_for_func_arg (rtx arg, basic_block bb)
24562 rtx insn = BB_END (bb);
24564 while (insn)
24566 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24568 rtx set = single_set (insn);
24569 if (set)
24571 avoid_func_arg_motion (arg, insn);
24572 return;
24575 if (insn == BB_HEAD (bb))
24576 return;
24577 insn = PREV_INSN (insn);
24581 /* Hook for pre-reload schedule - avoid motion of function arguments
24582 passed in likely spilled HW registers. */
24583 static void
24584 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24586 rtx insn;
24587 rtx first_arg = NULL;
24588 if (reload_completed)
24589 return;
24590 while (head != tail && DEBUG_INSN_P (head))
24591 head = NEXT_INSN (head);
24592 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24593 if (INSN_P (insn) && CALL_P (insn))
24595 first_arg = add_parameter_dependencies (insn, head);
24596 if (first_arg)
24598 /* Add a dependee for the first argument to predecessors, but only
24599 if the region contains more than one block. */
24600 basic_block bb = BLOCK_FOR_INSN (insn);
24601 int rgn = CONTAINING_RGN (bb->index);
24602 int nr_blks = RGN_NR_BLOCKS (rgn);
24603 /* Skip trivial regions and region head blocks that can have
24604 predecessors outside of region. */
24605 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24607 edge e;
24608 edge_iterator ei;
24609 /* Assume that region is SCC, i.e. all immediate predecessors
24610 of non-head block are in the same region. */
24611 FOR_EACH_EDGE (e, ei, bb->preds)
24613 /* Avoid creating loop-carried dependencies by
24614 using the topological ordering in the region. */
24615 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24616 add_dependee_for_func_arg (first_arg, e->src);
24619 insn = first_arg;
24620 if (insn == head)
24621 break;
24624 else if (first_arg)
24625 avoid_func_arg_motion (first_arg, insn);
24628 /* Hook for the pre-reload scheduler - set the priority of moves from
24629 likely-spilled HW registers to the maximum, to schedule them as soon as
24630 possible. These are moves from function argument registers at the top of
24631 the function entry and moves from function return value registers after a call. */
24632 static int
24633 ix86_adjust_priority (rtx insn, int priority)
24635 rtx set;
24637 if (reload_completed)
24638 return priority;
24640 if (!NONDEBUG_INSN_P (insn))
24641 return priority;
24643 set = single_set (insn);
24644 if (set)
24646 rtx tmp = SET_SRC (set);
24647 if (REG_P (tmp)
24648 && HARD_REGISTER_P (tmp)
24649 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24650 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24651 return current_sched_info->sched_max_insns_priority;
24654 return priority;
24657 /* Model the decoder of Core 2/i7.
24658 The hooks below, for multipass scheduling (see haifa-sched.c:max_issue),
24659 track the instruction fetch block boundaries and make sure that long
24660 (9+ byte) instructions are assigned to D0. */
24662 /* Maximum length of an insn that can be handled by
24663 a secondary decoder unit. '8' for Core 2/i7. */
24664 static int core2i7_secondary_decoder_max_insn_size;
24666 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24667 '16' for Core 2/i7. */
24668 static int core2i7_ifetch_block_size;
24670 /* Maximum number of instructions decoder can handle per cycle.
24671 '6' for Core 2/i7. */
24672 static int core2i7_ifetch_block_max_insns;
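/* Illustrative scenario for the filtering below (example numbers only): with
   the 16-byte ifetch block, once an 11-byte insn has been accepted in the
   current cycle, a 7-byte insn no longer fits (11 + 7 > 16) and is masked out
   of ready_try; the same happens after 6 insns have been accepted.  */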
24674 typedef struct ix86_first_cycle_multipass_data_ *
24675 ix86_first_cycle_multipass_data_t;
24676 typedef const struct ix86_first_cycle_multipass_data_ *
24677 const_ix86_first_cycle_multipass_data_t;
24679 /* A variable to store target state across calls to max_issue within
24680 one cycle. */
24681 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24682 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24684 /* Initialize DATA. */
24685 static void
24686 core2i7_first_cycle_multipass_init (void *_data)
24688 ix86_first_cycle_multipass_data_t data
24689 = (ix86_first_cycle_multipass_data_t) _data;
24691 data->ifetch_block_len = 0;
24692 data->ifetch_block_n_insns = 0;
24693 data->ready_try_change = NULL;
24694 data->ready_try_change_size = 0;
24697 /* Advancing the cycle; reset ifetch block counts. */
24698 static void
24699 core2i7_dfa_post_advance_cycle (void)
24701 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24703 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24705 data->ifetch_block_len = 0;
24706 data->ifetch_block_n_insns = 0;
24709 static int min_insn_size (rtx);
24711 /* Filter out insns from ready_try that the core will not be able to issue
24712 on current cycle due to decoder. */
24713 static void
24714 core2i7_first_cycle_multipass_filter_ready_try
24715 (const_ix86_first_cycle_multipass_data_t data,
24716 char *ready_try, int n_ready, bool first_cycle_insn_p)
24718 while (n_ready--)
24720 rtx insn;
24721 int insn_size;
24723 if (ready_try[n_ready])
24724 continue;
24726 insn = get_ready_element (n_ready);
24727 insn_size = min_insn_size (insn);
24729 if (/* If this is too long an insn for a secondary decoder ... */
24730 (!first_cycle_insn_p
24731 && insn_size > core2i7_secondary_decoder_max_insn_size)
24732 /* ... or it would not fit into the ifetch block ... */
24733 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24734 /* ... or the decoder is full already ... */
24735 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24736 /* ... mask the insn out. */
24738 ready_try[n_ready] = 1;
24740 if (data->ready_try_change)
24741 bitmap_set_bit (data->ready_try_change, n_ready);
24746 /* Prepare for a new round of multipass lookahead scheduling. */
24747 static void
24748 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24749 bool first_cycle_insn_p)
24751 ix86_first_cycle_multipass_data_t data
24752 = (ix86_first_cycle_multipass_data_t) _data;
24753 const_ix86_first_cycle_multipass_data_t prev_data
24754 = ix86_first_cycle_multipass_data;
24756 /* Restore the state from the end of the previous round. */
24757 data->ifetch_block_len = prev_data->ifetch_block_len;
24758 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24760 /* Filter instructions that cannot be issued on current cycle due to
24761 decoder restrictions. */
24762 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24763 first_cycle_insn_p);
24766 /* INSN is being issued in current solution. Account for its impact on
24767 the decoder model. */
24768 static void
24769 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24770 rtx insn, const void *_prev_data)
24772 ix86_first_cycle_multipass_data_t data
24773 = (ix86_first_cycle_multipass_data_t) _data;
24774 const_ix86_first_cycle_multipass_data_t prev_data
24775 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24777 int insn_size = min_insn_size (insn);
24779 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24780 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24781 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24782 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24784 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24785 if (!data->ready_try_change)
24787 data->ready_try_change = sbitmap_alloc (n_ready);
24788 data->ready_try_change_size = n_ready;
24790 else if (data->ready_try_change_size < n_ready)
24792 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24793 n_ready, 0);
24794 data->ready_try_change_size = n_ready;
24796 bitmap_clear (data->ready_try_change);
24798 /* Filter out insns from ready_try that the core will not be able to issue
24799 on the current cycle due to decoder restrictions. */
24800 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24801 false);
24804 /* Revert the effect on ready_try. */
24805 static void
24806 core2i7_first_cycle_multipass_backtrack (const void *_data,
24807 char *ready_try,
24808 int n_ready ATTRIBUTE_UNUSED)
24810 const_ix86_first_cycle_multipass_data_t data
24811 = (const_ix86_first_cycle_multipass_data_t) _data;
24812 unsigned int i = 0;
24813 sbitmap_iterator sbi;
24815 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24816 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24818 ready_try[i] = 0;
24822 /* Save the result of multipass lookahead scheduling for the next round. */
24823 static void
24824 core2i7_first_cycle_multipass_end (const void *_data)
24826 const_ix86_first_cycle_multipass_data_t data
24827 = (const_ix86_first_cycle_multipass_data_t) _data;
24828 ix86_first_cycle_multipass_data_t next_data
24829 = ix86_first_cycle_multipass_data;
24831 if (data != NULL)
24833 next_data->ifetch_block_len = data->ifetch_block_len;
24834 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24838 /* Deallocate target data. */
24839 static void
24840 core2i7_first_cycle_multipass_fini (void *_data)
24842 ix86_first_cycle_multipass_data_t data
24843 = (ix86_first_cycle_multipass_data_t) _data;
24845 if (data->ready_try_change)
24847 sbitmap_free (data->ready_try_change);
24848 data->ready_try_change = NULL;
24849 data->ready_try_change_size = 0;
24853 /* Prepare for scheduling pass. */
24854 static void
24855 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24856 int verbose ATTRIBUTE_UNUSED,
24857 int max_uid ATTRIBUTE_UNUSED)
24859 /* Install scheduling hooks for the current CPU. Some of these hooks are used
24860 in time-critical parts of the scheduler, so we only set them up when
24861 they are actually used. */
24862 switch (ix86_tune)
24864 case PROCESSOR_CORE2:
24865 case PROCESSOR_COREI7:
24866 /* Do not perform multipass scheduling for the pre-reload schedule,
24867 to save compile time. */
24868 if (reload_completed)
24870 targetm.sched.dfa_post_advance_cycle
24871 = core2i7_dfa_post_advance_cycle;
24872 targetm.sched.first_cycle_multipass_init
24873 = core2i7_first_cycle_multipass_init;
24874 targetm.sched.first_cycle_multipass_begin
24875 = core2i7_first_cycle_multipass_begin;
24876 targetm.sched.first_cycle_multipass_issue
24877 = core2i7_first_cycle_multipass_issue;
24878 targetm.sched.first_cycle_multipass_backtrack
24879 = core2i7_first_cycle_multipass_backtrack;
24880 targetm.sched.first_cycle_multipass_end
24881 = core2i7_first_cycle_multipass_end;
24882 targetm.sched.first_cycle_multipass_fini
24883 = core2i7_first_cycle_multipass_fini;
24885 /* Set decoder parameters. */
24886 core2i7_secondary_decoder_max_insn_size = 8;
24887 core2i7_ifetch_block_size = 16;
24888 core2i7_ifetch_block_max_insns = 6;
24889 break;
24891 /* ... Fall through ... */
24892 default:
24893 targetm.sched.dfa_post_advance_cycle = NULL;
24894 targetm.sched.first_cycle_multipass_init = NULL;
24895 targetm.sched.first_cycle_multipass_begin = NULL;
24896 targetm.sched.first_cycle_multipass_issue = NULL;
24897 targetm.sched.first_cycle_multipass_backtrack = NULL;
24898 targetm.sched.first_cycle_multipass_end = NULL;
24899 targetm.sched.first_cycle_multipass_fini = NULL;
24900 break;
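/* Editorial sketch (not part of the original sources): with the decoder
   parameters chosen above -- a 16-byte ifetch block, at most 6 insns per
   block, and an 8-byte limit for the secondary decoders -- the ready-list
   filter above behaves roughly like this hypothetical stand-alone model,
   which rejects an insn when it is longer than 8 bytes and not first on
   the cycle, when it would overflow the 16-byte ifetch block, or when the
   block already holds 6 insns:

     static bool
     insn_fits_current_block (int block_len, int block_n_insns,
                              int insn_size, bool first_cycle_insn_p)
     {
       return !((!first_cycle_insn_p && insn_size > 8)
                || block_len + insn_size > 16
                || block_n_insns + 1 > 6);
     }

   For example, once an 11-byte insn has been issued at the start of a
   block, a 6-byte insn no longer fits (11 + 6 > 16) and is masked out of
   ready_try until the next cycle.  */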
24905 /* Compute the alignment given to a constant that is being placed in memory.
24906 EXP is the constant and ALIGN is the alignment that the object would
24907 ordinarily have.
24908 The value of this function is used instead of that alignment to align
24909 the object. */
24912 ix86_constant_alignment (tree exp, int align)
24914 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24915 || TREE_CODE (exp) == INTEGER_CST)
24917 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24918 return 64;
24919 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24920 return 128;
24922 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24923 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24924 return BITS_PER_WORD;
24926 return align;
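/* Editorial illustration (a sketch, not from the original sources): this
   hook applies to constants being placed in memory, typically constant
   pool entries.  For the DFmode literal in, say,

     double scale (double x) { return x * 1.5; }

   the check above raises the pool entry's alignment to 64 bits, even
   though the 32-bit psABI only requires 4-byte alignment for double.  */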
24929 /* Compute the alignment for a static variable.
24930 TYPE is the data type, and ALIGN is the alignment that
24931 the object would ordinarily have. The value of this function is used
24932 instead of that alignment to align the object. */
24935 ix86_data_alignment (tree type, int align)
24937 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24939 if (AGGREGATE_TYPE_P (type)
24940 && TYPE_SIZE (type)
24941 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24942 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24943 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24944 && align < max_align)
24945 align = max_align;
24947 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
24948 to a 16-byte boundary. */
24949 if (TARGET_64BIT)
24951 if (AGGREGATE_TYPE_P (type)
24952 && TYPE_SIZE (type)
24953 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24954 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24955 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24956 return 128;
24959 if (TREE_CODE (type) == ARRAY_TYPE)
24961 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24962 return 64;
24963 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24964 return 128;
24966 else if (TREE_CODE (type) == COMPLEX_TYPE)
24969 if (TYPE_MODE (type) == DCmode && align < 64)
24970 return 64;
24971 if ((TYPE_MODE (type) == XCmode
24972 || TYPE_MODE (type) == TCmode) && align < 128)
24973 return 128;
24975 else if ((TREE_CODE (type) == RECORD_TYPE
24976 || TREE_CODE (type) == UNION_TYPE
24977 || TREE_CODE (type) == QUAL_UNION_TYPE)
24978 && TYPE_FIELDS (type))
24980 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24981 return 64;
24982 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24983 return 128;
24985 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24986 || TREE_CODE (type) == INTEGER_TYPE)
24988 if (TYPE_MODE (type) == DFmode && align < 64)
24989 return 64;
24990 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24991 return 128;
24994 return align;
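/* Editorial illustration (a sketch, not from the original sources): on
   x86-64 the aggregate check above means that, for example,

     static char buf[32];

   is placed on a 16-byte boundary even though its element type only
   needs 1-byte alignment, which allows vectorized code to use aligned
   SSE loads and stores on it.  */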
24997 /* Compute the alignment for a local variable or a stack slot. EXP is
24998 the data type or decl itself, MODE is the widest mode available and
24999 ALIGN is the alignment that the object would ordinarily have. The
25000 value of this macro is used instead of that alignment to align the
25001 object. */
25003 unsigned int
25004 ix86_local_alignment (tree exp, enum machine_mode mode,
25005 unsigned int align)
25007 tree type, decl;
25009 if (exp && DECL_P (exp))
25011 type = TREE_TYPE (exp);
25012 decl = exp;
25014 else
25016 type = exp;
25017 decl = NULL;
25020 /* Don't do dynamic stack realignment for long long objects with
25021 -mpreferred-stack-boundary=2. */
25022 if (!TARGET_64BIT
25023 && align == 64
25024 && ix86_preferred_stack_boundary < 64
25025 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25026 && (!type || !TYPE_USER_ALIGN (type))
25027 && (!decl || !DECL_USER_ALIGN (decl)))
25028 align = 32;
25030 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
25031 register in MODE. We will return the largest alignment of XF
25032 and DF. */
25033 if (!type)
25035 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25036 align = GET_MODE_ALIGNMENT (DFmode);
25037 return align;
25040 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
25041 to a 16-byte boundary. The exact wording is:
25043 An array uses the same alignment as its elements, except that a local or
25044 global array variable of length at least 16 bytes or
25045 a C99 variable-length array variable always has alignment of at least 16 bytes.
25047 This was added to allow use of aligned SSE instructions on arrays. The
25048 rule is meant for static storage (where the compiler cannot do the analysis
25049 by itself); we follow it for automatic variables only when convenient.
25050 We fully control everything in the function being compiled, and functions
25051 from other units cannot rely on the alignment.
25053 Exclude the va_list type. It is the common case of a local array where
25054 we cannot benefit from the alignment. */
25055 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25056 && TARGET_SSE)
25058 if (AGGREGATE_TYPE_P (type)
25059 && (va_list_type_node == NULL_TREE
25060 || (TYPE_MAIN_VARIANT (type)
25061 != TYPE_MAIN_VARIANT (va_list_type_node)))
25062 && TYPE_SIZE (type)
25063 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25064 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25065 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25066 return 128;
25068 if (TREE_CODE (type) == ARRAY_TYPE)
25070 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25071 return 64;
25072 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25073 return 128;
25075 else if (TREE_CODE (type) == COMPLEX_TYPE)
25077 if (TYPE_MODE (type) == DCmode && align < 64)
25078 return 64;
25079 if ((TYPE_MODE (type) == XCmode
25080 || TYPE_MODE (type) == TCmode) && align < 128)
25081 return 128;
25083 else if ((TREE_CODE (type) == RECORD_TYPE
25084 || TREE_CODE (type) == UNION_TYPE
25085 || TREE_CODE (type) == QUAL_UNION_TYPE)
25086 && TYPE_FIELDS (type))
25088 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25089 return 64;
25090 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25091 return 128;
25093 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25094 || TREE_CODE (type) == INTEGER_TYPE)
25097 if (TYPE_MODE (type) == DFmode && align < 64)
25098 return 64;
25099 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25100 return 128;
25102 return align;
25105 /* Compute the minimum required alignment for dynamic stack realignment
25106 purposes for a local variable, parameter or a stack slot. EXP is
25107 the data type or decl itself, MODE is its mode and ALIGN is the
25108 alignment that the object would ordinarily have. */
25110 unsigned int
25111 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25112 unsigned int align)
25114 tree type, decl;
25116 if (exp && DECL_P (exp))
25118 type = TREE_TYPE (exp);
25119 decl = exp;
25121 else
25123 type = exp;
25124 decl = NULL;
25127 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25128 return align;
25130 /* Don't do dynamic stack realignment for long long objects with
25131 -mpreferred-stack-boundary=2. */
25132 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25133 && (!type || !TYPE_USER_ALIGN (type))
25134 && (!decl || !DECL_USER_ALIGN (decl)))
25135 return 32;
25137 return align;
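/* Editorial illustration (a sketch, not from the original sources): when
   compiling 32-bit code with -mpreferred-stack-boundary=2, a local

     long long counter;

   only requires 32-bit alignment from this hook, so such a variable by
   itself does not force dynamic realignment of the stack frame.  */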
25140 /* Find a location for the static chain incoming to a nested function.
25141 This is a register, unless all free registers are used by arguments. */
25143 static rtx
25144 ix86_static_chain (const_tree fndecl, bool incoming_p)
25146 unsigned regno;
25148 if (!DECL_STATIC_CHAIN (fndecl))
25149 return NULL;
25151 if (TARGET_64BIT)
25153 /* We always use R10 in 64-bit mode. */
25154 regno = R10_REG;
25156 else
25158 tree fntype;
25159 unsigned int ccvt;
25161 /* By default in 32-bit mode we use ECX to pass the static chain. */
25162 regno = CX_REG;
25164 fntype = TREE_TYPE (fndecl);
25165 ccvt = ix86_get_callcvt (fntype);
25166 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25168 /* Fastcall functions use ecx/edx for arguments, which leaves
25169 us with EAX for the static chain.
25170 Thiscall functions use ecx for arguments, which also
25171 leaves us with EAX for the static chain. */
25172 regno = AX_REG;
25174 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25176 /* Thiscall functions use ecx for arguments, which leaves
25177 us with EAX and EDX for the static chain.
25178 For ABI compatibility we use EAX. */
25179 regno = AX_REG;
25181 else if (ix86_function_regparm (fntype, fndecl) == 3)
25183 /* For regparm 3, we have no free call-clobbered registers in
25184 which to store the static chain. In order to implement this,
25185 we have the trampoline push the static chain to the stack.
25186 However, we can't push a value below the return address when
25187 we call the nested function directly, so we have to use an
25188 alternate entry point. For this we use ESI, and have the
25189 alternate entry point push ESI, so that things appear the
25190 same once we're executing the nested function. */
25191 if (incoming_p)
25193 if (fndecl == current_function_decl)
25194 ix86_static_chain_on_stack = true;
25195 return gen_frame_mem (SImode,
25196 plus_constant (Pmode,
25197 arg_pointer_rtx, -8));
25199 regno = SI_REG;
25203 return gen_rtx_REG (Pmode, regno);
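/* Editorial summary (a sketch, not from the original sources) of the
   static chain locations chosen above:

     64-bit ....................... R10
     32-bit, default .............. ECX
     32-bit, fastcall/thiscall .... EAX
     32-bit, regparm (3) .......... on the stack; the caller loads ESI and
                                    uses an alternate entry point that
                                    pushes it

   For example, a nested function inside a __attribute__((regparm(3)))
   parent has no free call-clobbered register left for the chain, hence
   the stack slot returned for the incoming case above.  */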
25206 /* Emit RTL insns to initialize the variable parts of a trampoline.
25207 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25208 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25209 to be passed to the target function. */
25211 static void
25212 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25214 rtx mem, fnaddr;
25215 int opcode;
25216 int offset = 0;
25218 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25220 if (TARGET_64BIT)
25222 int size;
25224 /* Load the function address into r11. Try to load the address using
25225 the shorter movl instead of movabs. We may want to support
25226 movq for kernel mode, but the kernel does not use trampolines at
25227 the moment. FNADDR is a 32-bit address and may not be in
25228 DImode when ptr_mode == SImode. Always use movl in this
25229 case. */
25230 if (ptr_mode == SImode
25231 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25233 fnaddr = copy_addr_to_reg (fnaddr);
25235 mem = adjust_address (m_tramp, HImode, offset);
25236 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25238 mem = adjust_address (m_tramp, SImode, offset + 2);
25239 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25240 offset += 6;
25242 else
25244 mem = adjust_address (m_tramp, HImode, offset);
25245 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25247 mem = adjust_address (m_tramp, DImode, offset + 2);
25248 emit_move_insn (mem, fnaddr);
25249 offset += 10;
25252 /* Load static chain using movabs to r10. Use the shorter movl
25253 instead of movabs when ptr_mode == SImode. */
25254 if (ptr_mode == SImode)
25256 opcode = 0xba41;
25257 size = 6;
25259 else
25261 opcode = 0xba49;
25262 size = 10;
25265 mem = adjust_address (m_tramp, HImode, offset);
25266 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25268 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25269 emit_move_insn (mem, chain_value);
25270 offset += size;
25272 /* Jump to r11; the last (unused) byte is a nop, only there to
25273 pad the write out to a single 32-bit store. */
25274 mem = adjust_address (m_tramp, SImode, offset);
25275 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25276 offset += 4;
25278 else
25280 rtx disp, chain;
25282 /* Depending on the static chain location, either load a register
25283 with a constant, or push the constant to the stack. All of the
25284 instructions are the same size. */
25285 chain = ix86_static_chain (fndecl, true);
25286 if (REG_P (chain))
25288 switch (REGNO (chain))
25290 case AX_REG:
25291 opcode = 0xb8; break;
25292 case CX_REG:
25293 opcode = 0xb9; break;
25294 default:
25295 gcc_unreachable ();
25298 else
25299 opcode = 0x68;
25301 mem = adjust_address (m_tramp, QImode, offset);
25302 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25304 mem = adjust_address (m_tramp, SImode, offset + 1);
25305 emit_move_insn (mem, chain_value);
25306 offset += 5;
25308 mem = adjust_address (m_tramp, QImode, offset);
25309 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25311 mem = adjust_address (m_tramp, SImode, offset + 1);
25313 /* Compute offset from the end of the jmp to the target function.
25314 In the case where the trampoline stores the static chain on
25315 the stack, we need to skip the first insn which pushes the
25316 (call-saved) register static chain; this push is 1 byte. */
25317 offset += 5;
25318 disp = expand_binop (SImode, sub_optab, fnaddr,
25319 plus_constant (Pmode, XEXP (m_tramp, 0),
25320 offset - (MEM_P (chain) ? 1 : 0)),
25321 NULL_RTX, 1, OPTAB_DIRECT);
25322 emit_move_insn (mem, disp);
25325 gcc_assert (offset <= TRAMPOLINE_SIZE);
25327 #ifdef HAVE_ENABLE_EXECUTE_STACK
25328 #ifdef CHECK_EXECUTE_STACK_ENABLED
25329 if (CHECK_EXECUTE_STACK_ENABLED)
25330 #endif
25331 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25332 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25333 #endif
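/* Editorial illustration (a sketch, not from the original sources): when
   movabs is needed for both values, the 64-bit trampoline written above
   has this byte layout:

     49 bb <8-byte fnaddr>      movabs $fnaddr, %r11
     49 ba <8-byte chain>       movabs $chain,  %r10
     49 ff e3                   jmp    *%r11
     90                         nop  (pads the final 32-bit store)

   which corresponds to the little-endian 0xbb49, 0xba49 and 0x90e3ff49
   constants emitted above.  */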
25336 /* The following file contains several enumerations and data structures
25337 built from the definitions in i386-builtin-types.def. */
25339 #include "i386-builtin-types.inc"
25341 /* Table for the ix86 builtin non-function types. */
25342 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25344 /* Retrieve an element from the above table, building some of
25345 the types lazily. */
25347 static tree
25348 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25350 unsigned int index;
25351 tree type, itype;
25353 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25355 type = ix86_builtin_type_tab[(int) tcode];
25356 if (type != NULL)
25357 return type;
25359 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25360 if (tcode <= IX86_BT_LAST_VECT)
25362 enum machine_mode mode;
25364 index = tcode - IX86_BT_LAST_PRIM - 1;
25365 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25366 mode = ix86_builtin_type_vect_mode[index];
25368 type = build_vector_type_for_mode (itype, mode);
25370 else
25372 int quals;
25374 index = tcode - IX86_BT_LAST_VECT - 1;
25375 if (tcode <= IX86_BT_LAST_PTR)
25376 quals = TYPE_UNQUALIFIED;
25377 else
25378 quals = TYPE_QUAL_CONST;
25380 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25381 if (quals != TYPE_UNQUALIFIED)
25382 itype = build_qualified_type (itype, quals);
25384 type = build_pointer_type (itype);
25387 ix86_builtin_type_tab[(int) tcode] = type;
25388 return type;
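/* Editorial illustration (a sketch, not from the original sources): a
   vector type code is resolved lazily here roughly as follows.  Assuming
   IX86_BT_V4SF names the 4 x float vector type, the lookup amounts to

     itype = the float type recorded as its vect_base entry;
     type  = build_vector_type_for_mode (itype, V4SFmode);

   and the result is cached in ix86_builtin_type_tab so later requests
   for the same code return the same tree.  */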
25391 /* Table for the ix86 builtin function types. */
25392 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25394 /* Retrieve an element from the above table, building some of
25395 the types lazily. */
25397 static tree
25398 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25400 tree type;
25402 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25404 type = ix86_builtin_func_type_tab[(int) tcode];
25405 if (type != NULL)
25406 return type;
25408 if (tcode <= IX86_BT_LAST_FUNC)
25410 unsigned start = ix86_builtin_func_start[(int) tcode];
25411 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25412 tree rtype, atype, args = void_list_node;
25413 unsigned i;
25415 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25416 for (i = after - 1; i > start; --i)
25418 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25419 args = tree_cons (NULL, atype, args);
25422 type = build_function_type (rtype, args);
25424 else
25426 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25427 enum ix86_builtin_func_type icode;
25429 icode = ix86_builtin_func_alias_base[index];
25430 type = ix86_get_builtin_func_type (icode);
25433 ix86_builtin_func_type_tab[(int) tcode] = type;
25434 return type;
25438 /* Codes for all the SSE/MMX builtins. */
25439 enum ix86_builtins
25441 IX86_BUILTIN_ADDPS,
25442 IX86_BUILTIN_ADDSS,
25443 IX86_BUILTIN_DIVPS,
25444 IX86_BUILTIN_DIVSS,
25445 IX86_BUILTIN_MULPS,
25446 IX86_BUILTIN_MULSS,
25447 IX86_BUILTIN_SUBPS,
25448 IX86_BUILTIN_SUBSS,
25450 IX86_BUILTIN_CMPEQPS,
25451 IX86_BUILTIN_CMPLTPS,
25452 IX86_BUILTIN_CMPLEPS,
25453 IX86_BUILTIN_CMPGTPS,
25454 IX86_BUILTIN_CMPGEPS,
25455 IX86_BUILTIN_CMPNEQPS,
25456 IX86_BUILTIN_CMPNLTPS,
25457 IX86_BUILTIN_CMPNLEPS,
25458 IX86_BUILTIN_CMPNGTPS,
25459 IX86_BUILTIN_CMPNGEPS,
25460 IX86_BUILTIN_CMPORDPS,
25461 IX86_BUILTIN_CMPUNORDPS,
25462 IX86_BUILTIN_CMPEQSS,
25463 IX86_BUILTIN_CMPLTSS,
25464 IX86_BUILTIN_CMPLESS,
25465 IX86_BUILTIN_CMPNEQSS,
25466 IX86_BUILTIN_CMPNLTSS,
25467 IX86_BUILTIN_CMPNLESS,
25468 IX86_BUILTIN_CMPNGTSS,
25469 IX86_BUILTIN_CMPNGESS,
25470 IX86_BUILTIN_CMPORDSS,
25471 IX86_BUILTIN_CMPUNORDSS,
25473 IX86_BUILTIN_COMIEQSS,
25474 IX86_BUILTIN_COMILTSS,
25475 IX86_BUILTIN_COMILESS,
25476 IX86_BUILTIN_COMIGTSS,
25477 IX86_BUILTIN_COMIGESS,
25478 IX86_BUILTIN_COMINEQSS,
25479 IX86_BUILTIN_UCOMIEQSS,
25480 IX86_BUILTIN_UCOMILTSS,
25481 IX86_BUILTIN_UCOMILESS,
25482 IX86_BUILTIN_UCOMIGTSS,
25483 IX86_BUILTIN_UCOMIGESS,
25484 IX86_BUILTIN_UCOMINEQSS,
25486 IX86_BUILTIN_CVTPI2PS,
25487 IX86_BUILTIN_CVTPS2PI,
25488 IX86_BUILTIN_CVTSI2SS,
25489 IX86_BUILTIN_CVTSI642SS,
25490 IX86_BUILTIN_CVTSS2SI,
25491 IX86_BUILTIN_CVTSS2SI64,
25492 IX86_BUILTIN_CVTTPS2PI,
25493 IX86_BUILTIN_CVTTSS2SI,
25494 IX86_BUILTIN_CVTTSS2SI64,
25496 IX86_BUILTIN_MAXPS,
25497 IX86_BUILTIN_MAXSS,
25498 IX86_BUILTIN_MINPS,
25499 IX86_BUILTIN_MINSS,
25501 IX86_BUILTIN_LOADUPS,
25502 IX86_BUILTIN_STOREUPS,
25503 IX86_BUILTIN_MOVSS,
25505 IX86_BUILTIN_MOVHLPS,
25506 IX86_BUILTIN_MOVLHPS,
25507 IX86_BUILTIN_LOADHPS,
25508 IX86_BUILTIN_LOADLPS,
25509 IX86_BUILTIN_STOREHPS,
25510 IX86_BUILTIN_STORELPS,
25512 IX86_BUILTIN_MASKMOVQ,
25513 IX86_BUILTIN_MOVMSKPS,
25514 IX86_BUILTIN_PMOVMSKB,
25516 IX86_BUILTIN_MOVNTPS,
25517 IX86_BUILTIN_MOVNTQ,
25519 IX86_BUILTIN_LOADDQU,
25520 IX86_BUILTIN_STOREDQU,
25522 IX86_BUILTIN_PACKSSWB,
25523 IX86_BUILTIN_PACKSSDW,
25524 IX86_BUILTIN_PACKUSWB,
25526 IX86_BUILTIN_PADDB,
25527 IX86_BUILTIN_PADDW,
25528 IX86_BUILTIN_PADDD,
25529 IX86_BUILTIN_PADDQ,
25530 IX86_BUILTIN_PADDSB,
25531 IX86_BUILTIN_PADDSW,
25532 IX86_BUILTIN_PADDUSB,
25533 IX86_BUILTIN_PADDUSW,
25534 IX86_BUILTIN_PSUBB,
25535 IX86_BUILTIN_PSUBW,
25536 IX86_BUILTIN_PSUBD,
25537 IX86_BUILTIN_PSUBQ,
25538 IX86_BUILTIN_PSUBSB,
25539 IX86_BUILTIN_PSUBSW,
25540 IX86_BUILTIN_PSUBUSB,
25541 IX86_BUILTIN_PSUBUSW,
25543 IX86_BUILTIN_PAND,
25544 IX86_BUILTIN_PANDN,
25545 IX86_BUILTIN_POR,
25546 IX86_BUILTIN_PXOR,
25548 IX86_BUILTIN_PAVGB,
25549 IX86_BUILTIN_PAVGW,
25551 IX86_BUILTIN_PCMPEQB,
25552 IX86_BUILTIN_PCMPEQW,
25553 IX86_BUILTIN_PCMPEQD,
25554 IX86_BUILTIN_PCMPGTB,
25555 IX86_BUILTIN_PCMPGTW,
25556 IX86_BUILTIN_PCMPGTD,
25558 IX86_BUILTIN_PMADDWD,
25560 IX86_BUILTIN_PMAXSW,
25561 IX86_BUILTIN_PMAXUB,
25562 IX86_BUILTIN_PMINSW,
25563 IX86_BUILTIN_PMINUB,
25565 IX86_BUILTIN_PMULHUW,
25566 IX86_BUILTIN_PMULHW,
25567 IX86_BUILTIN_PMULLW,
25569 IX86_BUILTIN_PSADBW,
25570 IX86_BUILTIN_PSHUFW,
25572 IX86_BUILTIN_PSLLW,
25573 IX86_BUILTIN_PSLLD,
25574 IX86_BUILTIN_PSLLQ,
25575 IX86_BUILTIN_PSRAW,
25576 IX86_BUILTIN_PSRAD,
25577 IX86_BUILTIN_PSRLW,
25578 IX86_BUILTIN_PSRLD,
25579 IX86_BUILTIN_PSRLQ,
25580 IX86_BUILTIN_PSLLWI,
25581 IX86_BUILTIN_PSLLDI,
25582 IX86_BUILTIN_PSLLQI,
25583 IX86_BUILTIN_PSRAWI,
25584 IX86_BUILTIN_PSRADI,
25585 IX86_BUILTIN_PSRLWI,
25586 IX86_BUILTIN_PSRLDI,
25587 IX86_BUILTIN_PSRLQI,
25589 IX86_BUILTIN_PUNPCKHBW,
25590 IX86_BUILTIN_PUNPCKHWD,
25591 IX86_BUILTIN_PUNPCKHDQ,
25592 IX86_BUILTIN_PUNPCKLBW,
25593 IX86_BUILTIN_PUNPCKLWD,
25594 IX86_BUILTIN_PUNPCKLDQ,
25596 IX86_BUILTIN_SHUFPS,
25598 IX86_BUILTIN_RCPPS,
25599 IX86_BUILTIN_RCPSS,
25600 IX86_BUILTIN_RSQRTPS,
25601 IX86_BUILTIN_RSQRTPS_NR,
25602 IX86_BUILTIN_RSQRTSS,
25603 IX86_BUILTIN_RSQRTF,
25604 IX86_BUILTIN_SQRTPS,
25605 IX86_BUILTIN_SQRTPS_NR,
25606 IX86_BUILTIN_SQRTSS,
25608 IX86_BUILTIN_UNPCKHPS,
25609 IX86_BUILTIN_UNPCKLPS,
25611 IX86_BUILTIN_ANDPS,
25612 IX86_BUILTIN_ANDNPS,
25613 IX86_BUILTIN_ORPS,
25614 IX86_BUILTIN_XORPS,
25616 IX86_BUILTIN_EMMS,
25617 IX86_BUILTIN_LDMXCSR,
25618 IX86_BUILTIN_STMXCSR,
25619 IX86_BUILTIN_SFENCE,
25621 IX86_BUILTIN_FXSAVE,
25622 IX86_BUILTIN_FXRSTOR,
25623 IX86_BUILTIN_FXSAVE64,
25624 IX86_BUILTIN_FXRSTOR64,
25626 IX86_BUILTIN_XSAVE,
25627 IX86_BUILTIN_XRSTOR,
25628 IX86_BUILTIN_XSAVE64,
25629 IX86_BUILTIN_XRSTOR64,
25631 IX86_BUILTIN_XSAVEOPT,
25632 IX86_BUILTIN_XSAVEOPT64,
25634 /* 3DNow! Original */
25635 IX86_BUILTIN_FEMMS,
25636 IX86_BUILTIN_PAVGUSB,
25637 IX86_BUILTIN_PF2ID,
25638 IX86_BUILTIN_PFACC,
25639 IX86_BUILTIN_PFADD,
25640 IX86_BUILTIN_PFCMPEQ,
25641 IX86_BUILTIN_PFCMPGE,
25642 IX86_BUILTIN_PFCMPGT,
25643 IX86_BUILTIN_PFMAX,
25644 IX86_BUILTIN_PFMIN,
25645 IX86_BUILTIN_PFMUL,
25646 IX86_BUILTIN_PFRCP,
25647 IX86_BUILTIN_PFRCPIT1,
25648 IX86_BUILTIN_PFRCPIT2,
25649 IX86_BUILTIN_PFRSQIT1,
25650 IX86_BUILTIN_PFRSQRT,
25651 IX86_BUILTIN_PFSUB,
25652 IX86_BUILTIN_PFSUBR,
25653 IX86_BUILTIN_PI2FD,
25654 IX86_BUILTIN_PMULHRW,
25656 /* 3DNow! Athlon Extensions */
25657 IX86_BUILTIN_PF2IW,
25658 IX86_BUILTIN_PFNACC,
25659 IX86_BUILTIN_PFPNACC,
25660 IX86_BUILTIN_PI2FW,
25661 IX86_BUILTIN_PSWAPDSI,
25662 IX86_BUILTIN_PSWAPDSF,
25664 /* SSE2 */
25665 IX86_BUILTIN_ADDPD,
25666 IX86_BUILTIN_ADDSD,
25667 IX86_BUILTIN_DIVPD,
25668 IX86_BUILTIN_DIVSD,
25669 IX86_BUILTIN_MULPD,
25670 IX86_BUILTIN_MULSD,
25671 IX86_BUILTIN_SUBPD,
25672 IX86_BUILTIN_SUBSD,
25674 IX86_BUILTIN_CMPEQPD,
25675 IX86_BUILTIN_CMPLTPD,
25676 IX86_BUILTIN_CMPLEPD,
25677 IX86_BUILTIN_CMPGTPD,
25678 IX86_BUILTIN_CMPGEPD,
25679 IX86_BUILTIN_CMPNEQPD,
25680 IX86_BUILTIN_CMPNLTPD,
25681 IX86_BUILTIN_CMPNLEPD,
25682 IX86_BUILTIN_CMPNGTPD,
25683 IX86_BUILTIN_CMPNGEPD,
25684 IX86_BUILTIN_CMPORDPD,
25685 IX86_BUILTIN_CMPUNORDPD,
25686 IX86_BUILTIN_CMPEQSD,
25687 IX86_BUILTIN_CMPLTSD,
25688 IX86_BUILTIN_CMPLESD,
25689 IX86_BUILTIN_CMPNEQSD,
25690 IX86_BUILTIN_CMPNLTSD,
25691 IX86_BUILTIN_CMPNLESD,
25692 IX86_BUILTIN_CMPORDSD,
25693 IX86_BUILTIN_CMPUNORDSD,
25695 IX86_BUILTIN_COMIEQSD,
25696 IX86_BUILTIN_COMILTSD,
25697 IX86_BUILTIN_COMILESD,
25698 IX86_BUILTIN_COMIGTSD,
25699 IX86_BUILTIN_COMIGESD,
25700 IX86_BUILTIN_COMINEQSD,
25701 IX86_BUILTIN_UCOMIEQSD,
25702 IX86_BUILTIN_UCOMILTSD,
25703 IX86_BUILTIN_UCOMILESD,
25704 IX86_BUILTIN_UCOMIGTSD,
25705 IX86_BUILTIN_UCOMIGESD,
25706 IX86_BUILTIN_UCOMINEQSD,
25708 IX86_BUILTIN_MAXPD,
25709 IX86_BUILTIN_MAXSD,
25710 IX86_BUILTIN_MINPD,
25711 IX86_BUILTIN_MINSD,
25713 IX86_BUILTIN_ANDPD,
25714 IX86_BUILTIN_ANDNPD,
25715 IX86_BUILTIN_ORPD,
25716 IX86_BUILTIN_XORPD,
25718 IX86_BUILTIN_SQRTPD,
25719 IX86_BUILTIN_SQRTSD,
25721 IX86_BUILTIN_UNPCKHPD,
25722 IX86_BUILTIN_UNPCKLPD,
25724 IX86_BUILTIN_SHUFPD,
25726 IX86_BUILTIN_LOADUPD,
25727 IX86_BUILTIN_STOREUPD,
25728 IX86_BUILTIN_MOVSD,
25730 IX86_BUILTIN_LOADHPD,
25731 IX86_BUILTIN_LOADLPD,
25733 IX86_BUILTIN_CVTDQ2PD,
25734 IX86_BUILTIN_CVTDQ2PS,
25736 IX86_BUILTIN_CVTPD2DQ,
25737 IX86_BUILTIN_CVTPD2PI,
25738 IX86_BUILTIN_CVTPD2PS,
25739 IX86_BUILTIN_CVTTPD2DQ,
25740 IX86_BUILTIN_CVTTPD2PI,
25742 IX86_BUILTIN_CVTPI2PD,
25743 IX86_BUILTIN_CVTSI2SD,
25744 IX86_BUILTIN_CVTSI642SD,
25746 IX86_BUILTIN_CVTSD2SI,
25747 IX86_BUILTIN_CVTSD2SI64,
25748 IX86_BUILTIN_CVTSD2SS,
25749 IX86_BUILTIN_CVTSS2SD,
25750 IX86_BUILTIN_CVTTSD2SI,
25751 IX86_BUILTIN_CVTTSD2SI64,
25753 IX86_BUILTIN_CVTPS2DQ,
25754 IX86_BUILTIN_CVTPS2PD,
25755 IX86_BUILTIN_CVTTPS2DQ,
25757 IX86_BUILTIN_MOVNTI,
25758 IX86_BUILTIN_MOVNTI64,
25759 IX86_BUILTIN_MOVNTPD,
25760 IX86_BUILTIN_MOVNTDQ,
25762 IX86_BUILTIN_MOVQ128,
25764 /* SSE2 MMX */
25765 IX86_BUILTIN_MASKMOVDQU,
25766 IX86_BUILTIN_MOVMSKPD,
25767 IX86_BUILTIN_PMOVMSKB128,
25769 IX86_BUILTIN_PACKSSWB128,
25770 IX86_BUILTIN_PACKSSDW128,
25771 IX86_BUILTIN_PACKUSWB128,
25773 IX86_BUILTIN_PADDB128,
25774 IX86_BUILTIN_PADDW128,
25775 IX86_BUILTIN_PADDD128,
25776 IX86_BUILTIN_PADDQ128,
25777 IX86_BUILTIN_PADDSB128,
25778 IX86_BUILTIN_PADDSW128,
25779 IX86_BUILTIN_PADDUSB128,
25780 IX86_BUILTIN_PADDUSW128,
25781 IX86_BUILTIN_PSUBB128,
25782 IX86_BUILTIN_PSUBW128,
25783 IX86_BUILTIN_PSUBD128,
25784 IX86_BUILTIN_PSUBQ128,
25785 IX86_BUILTIN_PSUBSB128,
25786 IX86_BUILTIN_PSUBSW128,
25787 IX86_BUILTIN_PSUBUSB128,
25788 IX86_BUILTIN_PSUBUSW128,
25790 IX86_BUILTIN_PAND128,
25791 IX86_BUILTIN_PANDN128,
25792 IX86_BUILTIN_POR128,
25793 IX86_BUILTIN_PXOR128,
25795 IX86_BUILTIN_PAVGB128,
25796 IX86_BUILTIN_PAVGW128,
25798 IX86_BUILTIN_PCMPEQB128,
25799 IX86_BUILTIN_PCMPEQW128,
25800 IX86_BUILTIN_PCMPEQD128,
25801 IX86_BUILTIN_PCMPGTB128,
25802 IX86_BUILTIN_PCMPGTW128,
25803 IX86_BUILTIN_PCMPGTD128,
25805 IX86_BUILTIN_PMADDWD128,
25807 IX86_BUILTIN_PMAXSW128,
25808 IX86_BUILTIN_PMAXUB128,
25809 IX86_BUILTIN_PMINSW128,
25810 IX86_BUILTIN_PMINUB128,
25812 IX86_BUILTIN_PMULUDQ,
25813 IX86_BUILTIN_PMULUDQ128,
25814 IX86_BUILTIN_PMULHUW128,
25815 IX86_BUILTIN_PMULHW128,
25816 IX86_BUILTIN_PMULLW128,
25818 IX86_BUILTIN_PSADBW128,
25819 IX86_BUILTIN_PSHUFHW,
25820 IX86_BUILTIN_PSHUFLW,
25821 IX86_BUILTIN_PSHUFD,
25823 IX86_BUILTIN_PSLLDQI128,
25824 IX86_BUILTIN_PSLLWI128,
25825 IX86_BUILTIN_PSLLDI128,
25826 IX86_BUILTIN_PSLLQI128,
25827 IX86_BUILTIN_PSRAWI128,
25828 IX86_BUILTIN_PSRADI128,
25829 IX86_BUILTIN_PSRLDQI128,
25830 IX86_BUILTIN_PSRLWI128,
25831 IX86_BUILTIN_PSRLDI128,
25832 IX86_BUILTIN_PSRLQI128,
25834 IX86_BUILTIN_PSLLDQ128,
25835 IX86_BUILTIN_PSLLW128,
25836 IX86_BUILTIN_PSLLD128,
25837 IX86_BUILTIN_PSLLQ128,
25838 IX86_BUILTIN_PSRAW128,
25839 IX86_BUILTIN_PSRAD128,
25840 IX86_BUILTIN_PSRLW128,
25841 IX86_BUILTIN_PSRLD128,
25842 IX86_BUILTIN_PSRLQ128,
25844 IX86_BUILTIN_PUNPCKHBW128,
25845 IX86_BUILTIN_PUNPCKHWD128,
25846 IX86_BUILTIN_PUNPCKHDQ128,
25847 IX86_BUILTIN_PUNPCKHQDQ128,
25848 IX86_BUILTIN_PUNPCKLBW128,
25849 IX86_BUILTIN_PUNPCKLWD128,
25850 IX86_BUILTIN_PUNPCKLDQ128,
25851 IX86_BUILTIN_PUNPCKLQDQ128,
25853 IX86_BUILTIN_CLFLUSH,
25854 IX86_BUILTIN_MFENCE,
25855 IX86_BUILTIN_LFENCE,
25856 IX86_BUILTIN_PAUSE,
25858 IX86_BUILTIN_BSRSI,
25859 IX86_BUILTIN_BSRDI,
25860 IX86_BUILTIN_RDPMC,
25861 IX86_BUILTIN_RDTSC,
25862 IX86_BUILTIN_RDTSCP,
25863 IX86_BUILTIN_ROLQI,
25864 IX86_BUILTIN_ROLHI,
25865 IX86_BUILTIN_RORQI,
25866 IX86_BUILTIN_RORHI,
25868 /* SSE3. */
25869 IX86_BUILTIN_ADDSUBPS,
25870 IX86_BUILTIN_HADDPS,
25871 IX86_BUILTIN_HSUBPS,
25872 IX86_BUILTIN_MOVSHDUP,
25873 IX86_BUILTIN_MOVSLDUP,
25874 IX86_BUILTIN_ADDSUBPD,
25875 IX86_BUILTIN_HADDPD,
25876 IX86_BUILTIN_HSUBPD,
25877 IX86_BUILTIN_LDDQU,
25879 IX86_BUILTIN_MONITOR,
25880 IX86_BUILTIN_MWAIT,
25882 /* SSSE3. */
25883 IX86_BUILTIN_PHADDW,
25884 IX86_BUILTIN_PHADDD,
25885 IX86_BUILTIN_PHADDSW,
25886 IX86_BUILTIN_PHSUBW,
25887 IX86_BUILTIN_PHSUBD,
25888 IX86_BUILTIN_PHSUBSW,
25889 IX86_BUILTIN_PMADDUBSW,
25890 IX86_BUILTIN_PMULHRSW,
25891 IX86_BUILTIN_PSHUFB,
25892 IX86_BUILTIN_PSIGNB,
25893 IX86_BUILTIN_PSIGNW,
25894 IX86_BUILTIN_PSIGND,
25895 IX86_BUILTIN_PALIGNR,
25896 IX86_BUILTIN_PABSB,
25897 IX86_BUILTIN_PABSW,
25898 IX86_BUILTIN_PABSD,
25900 IX86_BUILTIN_PHADDW128,
25901 IX86_BUILTIN_PHADDD128,
25902 IX86_BUILTIN_PHADDSW128,
25903 IX86_BUILTIN_PHSUBW128,
25904 IX86_BUILTIN_PHSUBD128,
25905 IX86_BUILTIN_PHSUBSW128,
25906 IX86_BUILTIN_PMADDUBSW128,
25907 IX86_BUILTIN_PMULHRSW128,
25908 IX86_BUILTIN_PSHUFB128,
25909 IX86_BUILTIN_PSIGNB128,
25910 IX86_BUILTIN_PSIGNW128,
25911 IX86_BUILTIN_PSIGND128,
25912 IX86_BUILTIN_PALIGNR128,
25913 IX86_BUILTIN_PABSB128,
25914 IX86_BUILTIN_PABSW128,
25915 IX86_BUILTIN_PABSD128,
25917 /* AMDFAM10 - SSE4A New Instructions. */
25918 IX86_BUILTIN_MOVNTSD,
25919 IX86_BUILTIN_MOVNTSS,
25920 IX86_BUILTIN_EXTRQI,
25921 IX86_BUILTIN_EXTRQ,
25922 IX86_BUILTIN_INSERTQI,
25923 IX86_BUILTIN_INSERTQ,
25925 /* SSE4.1. */
25926 IX86_BUILTIN_BLENDPD,
25927 IX86_BUILTIN_BLENDPS,
25928 IX86_BUILTIN_BLENDVPD,
25929 IX86_BUILTIN_BLENDVPS,
25930 IX86_BUILTIN_PBLENDVB128,
25931 IX86_BUILTIN_PBLENDW128,
25933 IX86_BUILTIN_DPPD,
25934 IX86_BUILTIN_DPPS,
25936 IX86_BUILTIN_INSERTPS128,
25938 IX86_BUILTIN_MOVNTDQA,
25939 IX86_BUILTIN_MPSADBW128,
25940 IX86_BUILTIN_PACKUSDW128,
25941 IX86_BUILTIN_PCMPEQQ,
25942 IX86_BUILTIN_PHMINPOSUW128,
25944 IX86_BUILTIN_PMAXSB128,
25945 IX86_BUILTIN_PMAXSD128,
25946 IX86_BUILTIN_PMAXUD128,
25947 IX86_BUILTIN_PMAXUW128,
25949 IX86_BUILTIN_PMINSB128,
25950 IX86_BUILTIN_PMINSD128,
25951 IX86_BUILTIN_PMINUD128,
25952 IX86_BUILTIN_PMINUW128,
25954 IX86_BUILTIN_PMOVSXBW128,
25955 IX86_BUILTIN_PMOVSXBD128,
25956 IX86_BUILTIN_PMOVSXBQ128,
25957 IX86_BUILTIN_PMOVSXWD128,
25958 IX86_BUILTIN_PMOVSXWQ128,
25959 IX86_BUILTIN_PMOVSXDQ128,
25961 IX86_BUILTIN_PMOVZXBW128,
25962 IX86_BUILTIN_PMOVZXBD128,
25963 IX86_BUILTIN_PMOVZXBQ128,
25964 IX86_BUILTIN_PMOVZXWD128,
25965 IX86_BUILTIN_PMOVZXWQ128,
25966 IX86_BUILTIN_PMOVZXDQ128,
25968 IX86_BUILTIN_PMULDQ128,
25969 IX86_BUILTIN_PMULLD128,
25971 IX86_BUILTIN_ROUNDSD,
25972 IX86_BUILTIN_ROUNDSS,
25974 IX86_BUILTIN_ROUNDPD,
25975 IX86_BUILTIN_ROUNDPS,
25977 IX86_BUILTIN_FLOORPD,
25978 IX86_BUILTIN_CEILPD,
25979 IX86_BUILTIN_TRUNCPD,
25980 IX86_BUILTIN_RINTPD,
25981 IX86_BUILTIN_ROUNDPD_AZ,
25983 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25984 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25985 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25987 IX86_BUILTIN_FLOORPS,
25988 IX86_BUILTIN_CEILPS,
25989 IX86_BUILTIN_TRUNCPS,
25990 IX86_BUILTIN_RINTPS,
25991 IX86_BUILTIN_ROUNDPS_AZ,
25993 IX86_BUILTIN_FLOORPS_SFIX,
25994 IX86_BUILTIN_CEILPS_SFIX,
25995 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25997 IX86_BUILTIN_PTESTZ,
25998 IX86_BUILTIN_PTESTC,
25999 IX86_BUILTIN_PTESTNZC,
26001 IX86_BUILTIN_VEC_INIT_V2SI,
26002 IX86_BUILTIN_VEC_INIT_V4HI,
26003 IX86_BUILTIN_VEC_INIT_V8QI,
26004 IX86_BUILTIN_VEC_EXT_V2DF,
26005 IX86_BUILTIN_VEC_EXT_V2DI,
26006 IX86_BUILTIN_VEC_EXT_V4SF,
26007 IX86_BUILTIN_VEC_EXT_V4SI,
26008 IX86_BUILTIN_VEC_EXT_V8HI,
26009 IX86_BUILTIN_VEC_EXT_V2SI,
26010 IX86_BUILTIN_VEC_EXT_V4HI,
26011 IX86_BUILTIN_VEC_EXT_V16QI,
26012 IX86_BUILTIN_VEC_SET_V2DI,
26013 IX86_BUILTIN_VEC_SET_V4SF,
26014 IX86_BUILTIN_VEC_SET_V4SI,
26015 IX86_BUILTIN_VEC_SET_V8HI,
26016 IX86_BUILTIN_VEC_SET_V4HI,
26017 IX86_BUILTIN_VEC_SET_V16QI,
26019 IX86_BUILTIN_VEC_PACK_SFIX,
26020 IX86_BUILTIN_VEC_PACK_SFIX256,
26022 /* SSE4.2. */
26023 IX86_BUILTIN_CRC32QI,
26024 IX86_BUILTIN_CRC32HI,
26025 IX86_BUILTIN_CRC32SI,
26026 IX86_BUILTIN_CRC32DI,
26028 IX86_BUILTIN_PCMPESTRI128,
26029 IX86_BUILTIN_PCMPESTRM128,
26030 IX86_BUILTIN_PCMPESTRA128,
26031 IX86_BUILTIN_PCMPESTRC128,
26032 IX86_BUILTIN_PCMPESTRO128,
26033 IX86_BUILTIN_PCMPESTRS128,
26034 IX86_BUILTIN_PCMPESTRZ128,
26035 IX86_BUILTIN_PCMPISTRI128,
26036 IX86_BUILTIN_PCMPISTRM128,
26037 IX86_BUILTIN_PCMPISTRA128,
26038 IX86_BUILTIN_PCMPISTRC128,
26039 IX86_BUILTIN_PCMPISTRO128,
26040 IX86_BUILTIN_PCMPISTRS128,
26041 IX86_BUILTIN_PCMPISTRZ128,
26043 IX86_BUILTIN_PCMPGTQ,
26045 /* AES instructions */
26046 IX86_BUILTIN_AESENC128,
26047 IX86_BUILTIN_AESENCLAST128,
26048 IX86_BUILTIN_AESDEC128,
26049 IX86_BUILTIN_AESDECLAST128,
26050 IX86_BUILTIN_AESIMC128,
26051 IX86_BUILTIN_AESKEYGENASSIST128,
26053 /* PCLMUL instruction */
26054 IX86_BUILTIN_PCLMULQDQ128,
26056 /* AVX */
26057 IX86_BUILTIN_ADDPD256,
26058 IX86_BUILTIN_ADDPS256,
26059 IX86_BUILTIN_ADDSUBPD256,
26060 IX86_BUILTIN_ADDSUBPS256,
26061 IX86_BUILTIN_ANDPD256,
26062 IX86_BUILTIN_ANDPS256,
26063 IX86_BUILTIN_ANDNPD256,
26064 IX86_BUILTIN_ANDNPS256,
26065 IX86_BUILTIN_BLENDPD256,
26066 IX86_BUILTIN_BLENDPS256,
26067 IX86_BUILTIN_BLENDVPD256,
26068 IX86_BUILTIN_BLENDVPS256,
26069 IX86_BUILTIN_DIVPD256,
26070 IX86_BUILTIN_DIVPS256,
26071 IX86_BUILTIN_DPPS256,
26072 IX86_BUILTIN_HADDPD256,
26073 IX86_BUILTIN_HADDPS256,
26074 IX86_BUILTIN_HSUBPD256,
26075 IX86_BUILTIN_HSUBPS256,
26076 IX86_BUILTIN_MAXPD256,
26077 IX86_BUILTIN_MAXPS256,
26078 IX86_BUILTIN_MINPD256,
26079 IX86_BUILTIN_MINPS256,
26080 IX86_BUILTIN_MULPD256,
26081 IX86_BUILTIN_MULPS256,
26082 IX86_BUILTIN_ORPD256,
26083 IX86_BUILTIN_ORPS256,
26084 IX86_BUILTIN_SHUFPD256,
26085 IX86_BUILTIN_SHUFPS256,
26086 IX86_BUILTIN_SUBPD256,
26087 IX86_BUILTIN_SUBPS256,
26088 IX86_BUILTIN_XORPD256,
26089 IX86_BUILTIN_XORPS256,
26090 IX86_BUILTIN_CMPSD,
26091 IX86_BUILTIN_CMPSS,
26092 IX86_BUILTIN_CMPPD,
26093 IX86_BUILTIN_CMPPS,
26094 IX86_BUILTIN_CMPPD256,
26095 IX86_BUILTIN_CMPPS256,
26096 IX86_BUILTIN_CVTDQ2PD256,
26097 IX86_BUILTIN_CVTDQ2PS256,
26098 IX86_BUILTIN_CVTPD2PS256,
26099 IX86_BUILTIN_CVTPS2DQ256,
26100 IX86_BUILTIN_CVTPS2PD256,
26101 IX86_BUILTIN_CVTTPD2DQ256,
26102 IX86_BUILTIN_CVTPD2DQ256,
26103 IX86_BUILTIN_CVTTPS2DQ256,
26104 IX86_BUILTIN_EXTRACTF128PD256,
26105 IX86_BUILTIN_EXTRACTF128PS256,
26106 IX86_BUILTIN_EXTRACTF128SI256,
26107 IX86_BUILTIN_VZEROALL,
26108 IX86_BUILTIN_VZEROUPPER,
26109 IX86_BUILTIN_VPERMILVARPD,
26110 IX86_BUILTIN_VPERMILVARPS,
26111 IX86_BUILTIN_VPERMILVARPD256,
26112 IX86_BUILTIN_VPERMILVARPS256,
26113 IX86_BUILTIN_VPERMILPD,
26114 IX86_BUILTIN_VPERMILPS,
26115 IX86_BUILTIN_VPERMILPD256,
26116 IX86_BUILTIN_VPERMILPS256,
26117 IX86_BUILTIN_VPERMIL2PD,
26118 IX86_BUILTIN_VPERMIL2PS,
26119 IX86_BUILTIN_VPERMIL2PD256,
26120 IX86_BUILTIN_VPERMIL2PS256,
26121 IX86_BUILTIN_VPERM2F128PD256,
26122 IX86_BUILTIN_VPERM2F128PS256,
26123 IX86_BUILTIN_VPERM2F128SI256,
26124 IX86_BUILTIN_VBROADCASTSS,
26125 IX86_BUILTIN_VBROADCASTSD256,
26126 IX86_BUILTIN_VBROADCASTSS256,
26127 IX86_BUILTIN_VBROADCASTPD256,
26128 IX86_BUILTIN_VBROADCASTPS256,
26129 IX86_BUILTIN_VINSERTF128PD256,
26130 IX86_BUILTIN_VINSERTF128PS256,
26131 IX86_BUILTIN_VINSERTF128SI256,
26132 IX86_BUILTIN_LOADUPD256,
26133 IX86_BUILTIN_LOADUPS256,
26134 IX86_BUILTIN_STOREUPD256,
26135 IX86_BUILTIN_STOREUPS256,
26136 IX86_BUILTIN_LDDQU256,
26137 IX86_BUILTIN_MOVNTDQ256,
26138 IX86_BUILTIN_MOVNTPD256,
26139 IX86_BUILTIN_MOVNTPS256,
26140 IX86_BUILTIN_LOADDQU256,
26141 IX86_BUILTIN_STOREDQU256,
26142 IX86_BUILTIN_MASKLOADPD,
26143 IX86_BUILTIN_MASKLOADPS,
26144 IX86_BUILTIN_MASKSTOREPD,
26145 IX86_BUILTIN_MASKSTOREPS,
26146 IX86_BUILTIN_MASKLOADPD256,
26147 IX86_BUILTIN_MASKLOADPS256,
26148 IX86_BUILTIN_MASKSTOREPD256,
26149 IX86_BUILTIN_MASKSTOREPS256,
26150 IX86_BUILTIN_MOVSHDUP256,
26151 IX86_BUILTIN_MOVSLDUP256,
26152 IX86_BUILTIN_MOVDDUP256,
26154 IX86_BUILTIN_SQRTPD256,
26155 IX86_BUILTIN_SQRTPS256,
26156 IX86_BUILTIN_SQRTPS_NR256,
26157 IX86_BUILTIN_RSQRTPS256,
26158 IX86_BUILTIN_RSQRTPS_NR256,
26160 IX86_BUILTIN_RCPPS256,
26162 IX86_BUILTIN_ROUNDPD256,
26163 IX86_BUILTIN_ROUNDPS256,
26165 IX86_BUILTIN_FLOORPD256,
26166 IX86_BUILTIN_CEILPD256,
26167 IX86_BUILTIN_TRUNCPD256,
26168 IX86_BUILTIN_RINTPD256,
26169 IX86_BUILTIN_ROUNDPD_AZ256,
26171 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26172 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26173 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26175 IX86_BUILTIN_FLOORPS256,
26176 IX86_BUILTIN_CEILPS256,
26177 IX86_BUILTIN_TRUNCPS256,
26178 IX86_BUILTIN_RINTPS256,
26179 IX86_BUILTIN_ROUNDPS_AZ256,
26181 IX86_BUILTIN_FLOORPS_SFIX256,
26182 IX86_BUILTIN_CEILPS_SFIX256,
26183 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26185 IX86_BUILTIN_UNPCKHPD256,
26186 IX86_BUILTIN_UNPCKLPD256,
26187 IX86_BUILTIN_UNPCKHPS256,
26188 IX86_BUILTIN_UNPCKLPS256,
26190 IX86_BUILTIN_SI256_SI,
26191 IX86_BUILTIN_PS256_PS,
26192 IX86_BUILTIN_PD256_PD,
26193 IX86_BUILTIN_SI_SI256,
26194 IX86_BUILTIN_PS_PS256,
26195 IX86_BUILTIN_PD_PD256,
26197 IX86_BUILTIN_VTESTZPD,
26198 IX86_BUILTIN_VTESTCPD,
26199 IX86_BUILTIN_VTESTNZCPD,
26200 IX86_BUILTIN_VTESTZPS,
26201 IX86_BUILTIN_VTESTCPS,
26202 IX86_BUILTIN_VTESTNZCPS,
26203 IX86_BUILTIN_VTESTZPD256,
26204 IX86_BUILTIN_VTESTCPD256,
26205 IX86_BUILTIN_VTESTNZCPD256,
26206 IX86_BUILTIN_VTESTZPS256,
26207 IX86_BUILTIN_VTESTCPS256,
26208 IX86_BUILTIN_VTESTNZCPS256,
26209 IX86_BUILTIN_PTESTZ256,
26210 IX86_BUILTIN_PTESTC256,
26211 IX86_BUILTIN_PTESTNZC256,
26213 IX86_BUILTIN_MOVMSKPD256,
26214 IX86_BUILTIN_MOVMSKPS256,
26216 /* AVX2 */
26217 IX86_BUILTIN_MPSADBW256,
26218 IX86_BUILTIN_PABSB256,
26219 IX86_BUILTIN_PABSW256,
26220 IX86_BUILTIN_PABSD256,
26221 IX86_BUILTIN_PACKSSDW256,
26222 IX86_BUILTIN_PACKSSWB256,
26223 IX86_BUILTIN_PACKUSDW256,
26224 IX86_BUILTIN_PACKUSWB256,
26225 IX86_BUILTIN_PADDB256,
26226 IX86_BUILTIN_PADDW256,
26227 IX86_BUILTIN_PADDD256,
26228 IX86_BUILTIN_PADDQ256,
26229 IX86_BUILTIN_PADDSB256,
26230 IX86_BUILTIN_PADDSW256,
26231 IX86_BUILTIN_PADDUSB256,
26232 IX86_BUILTIN_PADDUSW256,
26233 IX86_BUILTIN_PALIGNR256,
26234 IX86_BUILTIN_AND256I,
26235 IX86_BUILTIN_ANDNOT256I,
26236 IX86_BUILTIN_PAVGB256,
26237 IX86_BUILTIN_PAVGW256,
26238 IX86_BUILTIN_PBLENDVB256,
26239 IX86_BUILTIN_PBLENDVW256,
26240 IX86_BUILTIN_PCMPEQB256,
26241 IX86_BUILTIN_PCMPEQW256,
26242 IX86_BUILTIN_PCMPEQD256,
26243 IX86_BUILTIN_PCMPEQQ256,
26244 IX86_BUILTIN_PCMPGTB256,
26245 IX86_BUILTIN_PCMPGTW256,
26246 IX86_BUILTIN_PCMPGTD256,
26247 IX86_BUILTIN_PCMPGTQ256,
26248 IX86_BUILTIN_PHADDW256,
26249 IX86_BUILTIN_PHADDD256,
26250 IX86_BUILTIN_PHADDSW256,
26251 IX86_BUILTIN_PHSUBW256,
26252 IX86_BUILTIN_PHSUBD256,
26253 IX86_BUILTIN_PHSUBSW256,
26254 IX86_BUILTIN_PMADDUBSW256,
26255 IX86_BUILTIN_PMADDWD256,
26256 IX86_BUILTIN_PMAXSB256,
26257 IX86_BUILTIN_PMAXSW256,
26258 IX86_BUILTIN_PMAXSD256,
26259 IX86_BUILTIN_PMAXUB256,
26260 IX86_BUILTIN_PMAXUW256,
26261 IX86_BUILTIN_PMAXUD256,
26262 IX86_BUILTIN_PMINSB256,
26263 IX86_BUILTIN_PMINSW256,
26264 IX86_BUILTIN_PMINSD256,
26265 IX86_BUILTIN_PMINUB256,
26266 IX86_BUILTIN_PMINUW256,
26267 IX86_BUILTIN_PMINUD256,
26268 IX86_BUILTIN_PMOVMSKB256,
26269 IX86_BUILTIN_PMOVSXBW256,
26270 IX86_BUILTIN_PMOVSXBD256,
26271 IX86_BUILTIN_PMOVSXBQ256,
26272 IX86_BUILTIN_PMOVSXWD256,
26273 IX86_BUILTIN_PMOVSXWQ256,
26274 IX86_BUILTIN_PMOVSXDQ256,
26275 IX86_BUILTIN_PMOVZXBW256,
26276 IX86_BUILTIN_PMOVZXBD256,
26277 IX86_BUILTIN_PMOVZXBQ256,
26278 IX86_BUILTIN_PMOVZXWD256,
26279 IX86_BUILTIN_PMOVZXWQ256,
26280 IX86_BUILTIN_PMOVZXDQ256,
26281 IX86_BUILTIN_PMULDQ256,
26282 IX86_BUILTIN_PMULHRSW256,
26283 IX86_BUILTIN_PMULHUW256,
26284 IX86_BUILTIN_PMULHW256,
26285 IX86_BUILTIN_PMULLW256,
26286 IX86_BUILTIN_PMULLD256,
26287 IX86_BUILTIN_PMULUDQ256,
26288 IX86_BUILTIN_POR256,
26289 IX86_BUILTIN_PSADBW256,
26290 IX86_BUILTIN_PSHUFB256,
26291 IX86_BUILTIN_PSHUFD256,
26292 IX86_BUILTIN_PSHUFHW256,
26293 IX86_BUILTIN_PSHUFLW256,
26294 IX86_BUILTIN_PSIGNB256,
26295 IX86_BUILTIN_PSIGNW256,
26296 IX86_BUILTIN_PSIGND256,
26297 IX86_BUILTIN_PSLLDQI256,
26298 IX86_BUILTIN_PSLLWI256,
26299 IX86_BUILTIN_PSLLW256,
26300 IX86_BUILTIN_PSLLDI256,
26301 IX86_BUILTIN_PSLLD256,
26302 IX86_BUILTIN_PSLLQI256,
26303 IX86_BUILTIN_PSLLQ256,
26304 IX86_BUILTIN_PSRAWI256,
26305 IX86_BUILTIN_PSRAW256,
26306 IX86_BUILTIN_PSRADI256,
26307 IX86_BUILTIN_PSRAD256,
26308 IX86_BUILTIN_PSRLDQI256,
26309 IX86_BUILTIN_PSRLWI256,
26310 IX86_BUILTIN_PSRLW256,
26311 IX86_BUILTIN_PSRLDI256,
26312 IX86_BUILTIN_PSRLD256,
26313 IX86_BUILTIN_PSRLQI256,
26314 IX86_BUILTIN_PSRLQ256,
26315 IX86_BUILTIN_PSUBB256,
26316 IX86_BUILTIN_PSUBW256,
26317 IX86_BUILTIN_PSUBD256,
26318 IX86_BUILTIN_PSUBQ256,
26319 IX86_BUILTIN_PSUBSB256,
26320 IX86_BUILTIN_PSUBSW256,
26321 IX86_BUILTIN_PSUBUSB256,
26322 IX86_BUILTIN_PSUBUSW256,
26323 IX86_BUILTIN_PUNPCKHBW256,
26324 IX86_BUILTIN_PUNPCKHWD256,
26325 IX86_BUILTIN_PUNPCKHDQ256,
26326 IX86_BUILTIN_PUNPCKHQDQ256,
26327 IX86_BUILTIN_PUNPCKLBW256,
26328 IX86_BUILTIN_PUNPCKLWD256,
26329 IX86_BUILTIN_PUNPCKLDQ256,
26330 IX86_BUILTIN_PUNPCKLQDQ256,
26331 IX86_BUILTIN_PXOR256,
26332 IX86_BUILTIN_MOVNTDQA256,
26333 IX86_BUILTIN_VBROADCASTSS_PS,
26334 IX86_BUILTIN_VBROADCASTSS_PS256,
26335 IX86_BUILTIN_VBROADCASTSD_PD256,
26336 IX86_BUILTIN_VBROADCASTSI256,
26337 IX86_BUILTIN_PBLENDD256,
26338 IX86_BUILTIN_PBLENDD128,
26339 IX86_BUILTIN_PBROADCASTB256,
26340 IX86_BUILTIN_PBROADCASTW256,
26341 IX86_BUILTIN_PBROADCASTD256,
26342 IX86_BUILTIN_PBROADCASTQ256,
26343 IX86_BUILTIN_PBROADCASTB128,
26344 IX86_BUILTIN_PBROADCASTW128,
26345 IX86_BUILTIN_PBROADCASTD128,
26346 IX86_BUILTIN_PBROADCASTQ128,
26347 IX86_BUILTIN_VPERMVARSI256,
26348 IX86_BUILTIN_VPERMDF256,
26349 IX86_BUILTIN_VPERMVARSF256,
26350 IX86_BUILTIN_VPERMDI256,
26351 IX86_BUILTIN_VPERMTI256,
26352 IX86_BUILTIN_VEXTRACT128I256,
26353 IX86_BUILTIN_VINSERT128I256,
26354 IX86_BUILTIN_MASKLOADD,
26355 IX86_BUILTIN_MASKLOADQ,
26356 IX86_BUILTIN_MASKLOADD256,
26357 IX86_BUILTIN_MASKLOADQ256,
26358 IX86_BUILTIN_MASKSTORED,
26359 IX86_BUILTIN_MASKSTOREQ,
26360 IX86_BUILTIN_MASKSTORED256,
26361 IX86_BUILTIN_MASKSTOREQ256,
26362 IX86_BUILTIN_PSLLVV4DI,
26363 IX86_BUILTIN_PSLLVV2DI,
26364 IX86_BUILTIN_PSLLVV8SI,
26365 IX86_BUILTIN_PSLLVV4SI,
26366 IX86_BUILTIN_PSRAVV8SI,
26367 IX86_BUILTIN_PSRAVV4SI,
26368 IX86_BUILTIN_PSRLVV4DI,
26369 IX86_BUILTIN_PSRLVV2DI,
26370 IX86_BUILTIN_PSRLVV8SI,
26371 IX86_BUILTIN_PSRLVV4SI,
26373 IX86_BUILTIN_GATHERSIV2DF,
26374 IX86_BUILTIN_GATHERSIV4DF,
26375 IX86_BUILTIN_GATHERDIV2DF,
26376 IX86_BUILTIN_GATHERDIV4DF,
26377 IX86_BUILTIN_GATHERSIV4SF,
26378 IX86_BUILTIN_GATHERSIV8SF,
26379 IX86_BUILTIN_GATHERDIV4SF,
26380 IX86_BUILTIN_GATHERDIV8SF,
26381 IX86_BUILTIN_GATHERSIV2DI,
26382 IX86_BUILTIN_GATHERSIV4DI,
26383 IX86_BUILTIN_GATHERDIV2DI,
26384 IX86_BUILTIN_GATHERDIV4DI,
26385 IX86_BUILTIN_GATHERSIV4SI,
26386 IX86_BUILTIN_GATHERSIV8SI,
26387 IX86_BUILTIN_GATHERDIV4SI,
26388 IX86_BUILTIN_GATHERDIV8SI,
26390 /* Alternate 4-element gather for the vectorizer where
26391 all operands are 32-byte wide. */
26392 IX86_BUILTIN_GATHERALTSIV4DF,
26393 IX86_BUILTIN_GATHERALTDIV8SF,
26394 IX86_BUILTIN_GATHERALTSIV4DI,
26395 IX86_BUILTIN_GATHERALTDIV8SI,
26397 /* TFmode support builtins. */
26398 IX86_BUILTIN_INFQ,
26399 IX86_BUILTIN_HUGE_VALQ,
26400 IX86_BUILTIN_FABSQ,
26401 IX86_BUILTIN_COPYSIGNQ,
26403 /* Vectorizer support builtins. */
26404 IX86_BUILTIN_CPYSGNPS,
26405 IX86_BUILTIN_CPYSGNPD,
26406 IX86_BUILTIN_CPYSGNPS256,
26407 IX86_BUILTIN_CPYSGNPD256,
26409 /* FMA4 instructions. */
26410 IX86_BUILTIN_VFMADDSS,
26411 IX86_BUILTIN_VFMADDSD,
26412 IX86_BUILTIN_VFMADDPS,
26413 IX86_BUILTIN_VFMADDPD,
26414 IX86_BUILTIN_VFMADDPS256,
26415 IX86_BUILTIN_VFMADDPD256,
26416 IX86_BUILTIN_VFMADDSUBPS,
26417 IX86_BUILTIN_VFMADDSUBPD,
26418 IX86_BUILTIN_VFMADDSUBPS256,
26419 IX86_BUILTIN_VFMADDSUBPD256,
26421 /* FMA3 instructions. */
26422 IX86_BUILTIN_VFMADDSS3,
26423 IX86_BUILTIN_VFMADDSD3,
26425 /* XOP instructions. */
26426 IX86_BUILTIN_VPCMOV,
26427 IX86_BUILTIN_VPCMOV_V2DI,
26428 IX86_BUILTIN_VPCMOV_V4SI,
26429 IX86_BUILTIN_VPCMOV_V8HI,
26430 IX86_BUILTIN_VPCMOV_V16QI,
26431 IX86_BUILTIN_VPCMOV_V4SF,
26432 IX86_BUILTIN_VPCMOV_V2DF,
26433 IX86_BUILTIN_VPCMOV256,
26434 IX86_BUILTIN_VPCMOV_V4DI256,
26435 IX86_BUILTIN_VPCMOV_V8SI256,
26436 IX86_BUILTIN_VPCMOV_V16HI256,
26437 IX86_BUILTIN_VPCMOV_V32QI256,
26438 IX86_BUILTIN_VPCMOV_V8SF256,
26439 IX86_BUILTIN_VPCMOV_V4DF256,
26441 IX86_BUILTIN_VPPERM,
26443 IX86_BUILTIN_VPMACSSWW,
26444 IX86_BUILTIN_VPMACSWW,
26445 IX86_BUILTIN_VPMACSSWD,
26446 IX86_BUILTIN_VPMACSWD,
26447 IX86_BUILTIN_VPMACSSDD,
26448 IX86_BUILTIN_VPMACSDD,
26449 IX86_BUILTIN_VPMACSSDQL,
26450 IX86_BUILTIN_VPMACSSDQH,
26451 IX86_BUILTIN_VPMACSDQL,
26452 IX86_BUILTIN_VPMACSDQH,
26453 IX86_BUILTIN_VPMADCSSWD,
26454 IX86_BUILTIN_VPMADCSWD,
26456 IX86_BUILTIN_VPHADDBW,
26457 IX86_BUILTIN_VPHADDBD,
26458 IX86_BUILTIN_VPHADDBQ,
26459 IX86_BUILTIN_VPHADDWD,
26460 IX86_BUILTIN_VPHADDWQ,
26461 IX86_BUILTIN_VPHADDDQ,
26462 IX86_BUILTIN_VPHADDUBW,
26463 IX86_BUILTIN_VPHADDUBD,
26464 IX86_BUILTIN_VPHADDUBQ,
26465 IX86_BUILTIN_VPHADDUWD,
26466 IX86_BUILTIN_VPHADDUWQ,
26467 IX86_BUILTIN_VPHADDUDQ,
26468 IX86_BUILTIN_VPHSUBBW,
26469 IX86_BUILTIN_VPHSUBWD,
26470 IX86_BUILTIN_VPHSUBDQ,
26472 IX86_BUILTIN_VPROTB,
26473 IX86_BUILTIN_VPROTW,
26474 IX86_BUILTIN_VPROTD,
26475 IX86_BUILTIN_VPROTQ,
26476 IX86_BUILTIN_VPROTB_IMM,
26477 IX86_BUILTIN_VPROTW_IMM,
26478 IX86_BUILTIN_VPROTD_IMM,
26479 IX86_BUILTIN_VPROTQ_IMM,
26481 IX86_BUILTIN_VPSHLB,
26482 IX86_BUILTIN_VPSHLW,
26483 IX86_BUILTIN_VPSHLD,
26484 IX86_BUILTIN_VPSHLQ,
26485 IX86_BUILTIN_VPSHAB,
26486 IX86_BUILTIN_VPSHAW,
26487 IX86_BUILTIN_VPSHAD,
26488 IX86_BUILTIN_VPSHAQ,
26490 IX86_BUILTIN_VFRCZSS,
26491 IX86_BUILTIN_VFRCZSD,
26492 IX86_BUILTIN_VFRCZPS,
26493 IX86_BUILTIN_VFRCZPD,
26494 IX86_BUILTIN_VFRCZPS256,
26495 IX86_BUILTIN_VFRCZPD256,
26497 IX86_BUILTIN_VPCOMEQUB,
26498 IX86_BUILTIN_VPCOMNEUB,
26499 IX86_BUILTIN_VPCOMLTUB,
26500 IX86_BUILTIN_VPCOMLEUB,
26501 IX86_BUILTIN_VPCOMGTUB,
26502 IX86_BUILTIN_VPCOMGEUB,
26503 IX86_BUILTIN_VPCOMFALSEUB,
26504 IX86_BUILTIN_VPCOMTRUEUB,
26506 IX86_BUILTIN_VPCOMEQUW,
26507 IX86_BUILTIN_VPCOMNEUW,
26508 IX86_BUILTIN_VPCOMLTUW,
26509 IX86_BUILTIN_VPCOMLEUW,
26510 IX86_BUILTIN_VPCOMGTUW,
26511 IX86_BUILTIN_VPCOMGEUW,
26512 IX86_BUILTIN_VPCOMFALSEUW,
26513 IX86_BUILTIN_VPCOMTRUEUW,
26515 IX86_BUILTIN_VPCOMEQUD,
26516 IX86_BUILTIN_VPCOMNEUD,
26517 IX86_BUILTIN_VPCOMLTUD,
26518 IX86_BUILTIN_VPCOMLEUD,
26519 IX86_BUILTIN_VPCOMGTUD,
26520 IX86_BUILTIN_VPCOMGEUD,
26521 IX86_BUILTIN_VPCOMFALSEUD,
26522 IX86_BUILTIN_VPCOMTRUEUD,
26524 IX86_BUILTIN_VPCOMEQUQ,
26525 IX86_BUILTIN_VPCOMNEUQ,
26526 IX86_BUILTIN_VPCOMLTUQ,
26527 IX86_BUILTIN_VPCOMLEUQ,
26528 IX86_BUILTIN_VPCOMGTUQ,
26529 IX86_BUILTIN_VPCOMGEUQ,
26530 IX86_BUILTIN_VPCOMFALSEUQ,
26531 IX86_BUILTIN_VPCOMTRUEUQ,
26533 IX86_BUILTIN_VPCOMEQB,
26534 IX86_BUILTIN_VPCOMNEB,
26535 IX86_BUILTIN_VPCOMLTB,
26536 IX86_BUILTIN_VPCOMLEB,
26537 IX86_BUILTIN_VPCOMGTB,
26538 IX86_BUILTIN_VPCOMGEB,
26539 IX86_BUILTIN_VPCOMFALSEB,
26540 IX86_BUILTIN_VPCOMTRUEB,
26542 IX86_BUILTIN_VPCOMEQW,
26543 IX86_BUILTIN_VPCOMNEW,
26544 IX86_BUILTIN_VPCOMLTW,
26545 IX86_BUILTIN_VPCOMLEW,
26546 IX86_BUILTIN_VPCOMGTW,
26547 IX86_BUILTIN_VPCOMGEW,
26548 IX86_BUILTIN_VPCOMFALSEW,
26549 IX86_BUILTIN_VPCOMTRUEW,
26551 IX86_BUILTIN_VPCOMEQD,
26552 IX86_BUILTIN_VPCOMNED,
26553 IX86_BUILTIN_VPCOMLTD,
26554 IX86_BUILTIN_VPCOMLED,
26555 IX86_BUILTIN_VPCOMGTD,
26556 IX86_BUILTIN_VPCOMGED,
26557 IX86_BUILTIN_VPCOMFALSED,
26558 IX86_BUILTIN_VPCOMTRUED,
26560 IX86_BUILTIN_VPCOMEQQ,
26561 IX86_BUILTIN_VPCOMNEQ,
26562 IX86_BUILTIN_VPCOMLTQ,
26563 IX86_BUILTIN_VPCOMLEQ,
26564 IX86_BUILTIN_VPCOMGTQ,
26565 IX86_BUILTIN_VPCOMGEQ,
26566 IX86_BUILTIN_VPCOMFALSEQ,
26567 IX86_BUILTIN_VPCOMTRUEQ,
26569 /* LWP instructions. */
26570 IX86_BUILTIN_LLWPCB,
26571 IX86_BUILTIN_SLWPCB,
26572 IX86_BUILTIN_LWPVAL32,
26573 IX86_BUILTIN_LWPVAL64,
26574 IX86_BUILTIN_LWPINS32,
26575 IX86_BUILTIN_LWPINS64,
26577 IX86_BUILTIN_CLZS,
26579 /* RTM */
26580 IX86_BUILTIN_XBEGIN,
26581 IX86_BUILTIN_XEND,
26582 IX86_BUILTIN_XABORT,
26583 IX86_BUILTIN_XTEST,
26585 /* BMI instructions. */
26586 IX86_BUILTIN_BEXTR32,
26587 IX86_BUILTIN_BEXTR64,
26588 IX86_BUILTIN_CTZS,
26590 /* TBM instructions. */
26591 IX86_BUILTIN_BEXTRI32,
26592 IX86_BUILTIN_BEXTRI64,
26594 /* BMI2 instructions. */
26595 IX86_BUILTIN_BZHI32,
26596 IX86_BUILTIN_BZHI64,
26597 IX86_BUILTIN_PDEP32,
26598 IX86_BUILTIN_PDEP64,
26599 IX86_BUILTIN_PEXT32,
26600 IX86_BUILTIN_PEXT64,
26602 /* ADX instructions. */
26603 IX86_BUILTIN_ADDCARRYX32,
26604 IX86_BUILTIN_ADDCARRYX64,
26606 /* FSGSBASE instructions. */
26607 IX86_BUILTIN_RDFSBASE32,
26608 IX86_BUILTIN_RDFSBASE64,
26609 IX86_BUILTIN_RDGSBASE32,
26610 IX86_BUILTIN_RDGSBASE64,
26611 IX86_BUILTIN_WRFSBASE32,
26612 IX86_BUILTIN_WRFSBASE64,
26613 IX86_BUILTIN_WRGSBASE32,
26614 IX86_BUILTIN_WRGSBASE64,
26616 /* RDRND instructions. */
26617 IX86_BUILTIN_RDRAND16_STEP,
26618 IX86_BUILTIN_RDRAND32_STEP,
26619 IX86_BUILTIN_RDRAND64_STEP,
26621 /* RDSEED instructions. */
26622 IX86_BUILTIN_RDSEED16_STEP,
26623 IX86_BUILTIN_RDSEED32_STEP,
26624 IX86_BUILTIN_RDSEED64_STEP,
26626 /* F16C instructions. */
26627 IX86_BUILTIN_CVTPH2PS,
26628 IX86_BUILTIN_CVTPH2PS256,
26629 IX86_BUILTIN_CVTPS2PH,
26630 IX86_BUILTIN_CVTPS2PH256,
26632 /* CFString built-in for darwin */
26633 IX86_BUILTIN_CFSTRING,
26635 /* Builtins to get CPU type and supported features. */
26636 IX86_BUILTIN_CPU_INIT,
26637 IX86_BUILTIN_CPU_IS,
26638 IX86_BUILTIN_CPU_SUPPORTS,
26640 IX86_BUILTIN_MAX
26643 /* Table for the ix86 builtin decls. */
26644 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26646 /* Table of all of the builtin functions that are possible with different ISAs,
26647 but are waiting to be built until a function is declared to use that
26648 ISA. */
26649 struct builtin_isa {
26650 const char *name; /* function name */
26651 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26652 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26653 bool const_p; /* true if the declaration is constant */
26654 bool set_and_not_built_p;
26657 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26660 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26661 of which isa_flags to use in the ix86_builtins_isa array. Store the
26662 function decl in the ix86_builtins array. Return the function decl, or
26663 NULL_TREE if the builtin was not added.
26665 If the front end has a special hook for builtin functions, delay adding
26666 builtin functions that aren't in the current ISA until the ISA is changed
26667 with function specific optimization. Doing so can save about 300K for the
26668 default compiler. When the builtin is expanded, check at that time whether
26669 it is valid.
26671 If the front end doesn't have a special hook, record all builtins, even
26672 those not in the current ISA, in case the user uses function specific
26673 options for a different ISA, so that we don't get scope errors if a
26674 builtin is added in the middle of a function scope. */
26676 static inline tree
26677 def_builtin (HOST_WIDE_INT mask, const char *name,
26678 enum ix86_builtin_func_type tcode,
26679 enum ix86_builtins code)
26681 tree decl = NULL_TREE;
26683 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26685 ix86_builtins_isa[(int) code].isa = mask;
26687 mask &= ~OPTION_MASK_ISA_64BIT;
26688 if (mask == 0
26689 || (mask & ix86_isa_flags) != 0
26690 || (lang_hooks.builtin_function
26691 == lang_hooks.builtin_function_ext_scope))
26694 tree type = ix86_get_builtin_func_type (tcode);
26695 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26696 NULL, NULL_TREE);
26697 ix86_builtins[(int) code] = decl;
26698 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26700 else
26702 ix86_builtins[(int) code] = NULL_TREE;
26703 ix86_builtins_isa[(int) code].tcode = tcode;
26704 ix86_builtins_isa[(int) code].name = name;
26705 ix86_builtins_isa[(int) code].const_p = false;
26706 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26710 return decl;
26713 /* Like def_builtin, but also marks the function decl "const". */
26715 static inline tree
26716 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26717 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26719 tree decl = def_builtin (mask, name, tcode, code);
26720 if (decl)
26721 TREE_READONLY (decl) = 1;
26722 else
26723 ix86_builtins_isa[(int) code].const_p = true;
26725 return decl;
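/* Editorial illustration (a sketch, not from the original sources): a
   typical registration looks roughly like

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
                        V2DF_FTYPE_V2DF_V2DF, IX86_BUILTIN_EXAMPLE);

   where the builtin name, type code and IX86_BUILTIN_EXAMPLE enumerator
   are purely illustrative.  If the requested ISA is not enabled and the
   front end has the special ext-scope hook, the declaration is deferred
   and later materialized by ix86_add_new_builtins once the ISA is
   enabled through function specific options.  */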
26728 /* Add any new builtin functions for a given ISA that may not have been
26729 declared. This saves a bit of space compared to adding all of the
26730 declarations to the tree, even if we didn't use them. */
26732 static void
26733 ix86_add_new_builtins (HOST_WIDE_INT isa)
26735 int i;
26737 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26739 if ((ix86_builtins_isa[i].isa & isa) != 0
26740 && ix86_builtins_isa[i].set_and_not_built_p)
26742 tree decl, type;
26744 /* Don't define the builtin again. */
26745 ix86_builtins_isa[i].set_and_not_built_p = false;
26747 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26748 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26749 type, i, BUILT_IN_MD, NULL,
26750 NULL_TREE);
26752 ix86_builtins[i] = decl;
26753 if (ix86_builtins_isa[i].const_p)
26754 TREE_READONLY (decl) = 1;
26759 /* Bits for builtin_description.flag. */
26761 /* Set when we don't support the comparison natively, and should
26762 swap_comparison in order to support it. */
26763 #define BUILTIN_DESC_SWAP_OPERANDS 1
26765 struct builtin_description
26767 const HOST_WIDE_INT mask;
26768 const enum insn_code icode;
26769 const char *const name;
26770 const enum ix86_builtins code;
26771 const enum rtx_code comparison;
26772 const int flag;
26775 static const struct builtin_description bdesc_comi[] =
26777 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26778 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26779 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26780 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26781 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26782 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26783 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26784 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26785 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26786 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26787 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26788 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26789 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26791 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26792 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26793 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26794 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26795 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26796 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26797 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26798 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26799 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26800 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26801 };
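/* Usage sketch (added for illustration; the helper below is an assumption,
   not part of this file): each bdesc_comi entry becomes an int-valued scalar
   compare on the low element, and <xmmintrin.h>/<emmintrin.h> wrap them as
   the _mm_comi* / _mm_ucomi* intrinsics.  */
#if 0
#include <x86intrin.h>

static int
example_comilt (__m128 a, __m128 b)   /* hypothetical helper, compile with -msse */
{
  /* Nonzero when the low float of A is less than the low float of B,
     per the COMISS-based entries in the table above.  */
  return __builtin_ia32_comilt ((__v4sf) a, (__v4sf) b);
}
#endif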
26803 static const struct builtin_description bdesc_pcmpestr[] =
26804 {
26805 /* SSE4.2 */
26806 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26807 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26808 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26809 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26810 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26811 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26812 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26813 };
26815 static const struct builtin_description bdesc_pcmpistr[] =
26816 {
26817 /* SSE4.2 */
26818 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26819 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26820 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26821 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26822 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26823 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26824 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26825 };
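/* Usage sketch (added for illustration; the helper below is an assumption,
   not part of this file): for the pcmpestr/pcmpistr entries the flag field
   carries a CC mode instead of a comparison code -- the *a/*c/*o/*s/*z
   variants each read one EFLAGS bit produced by the string compare, and
   <smmintrin.h> wraps them as _mm_cmpistra, _mm_cmpistrc, and so on.  */
#if 0
#include <x86intrin.h>

static int
example_cmpistrc (__m128i needle, __m128i haystack)   /* compile with -msse4.2 */
{
  /* CCCmode entry above: returns the carry flag, i.e. nonzero when any byte
     of NEEDLE occurs in HAYSTACK (mode 0: unsigned bytes, "equal any").  */
  return __builtin_ia32_pcmpistric128 ((__v16qi) needle, (__v16qi) haystack, 0);
}
#endif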
26827 /* Special builtins with variable number of arguments. */
26828 static const struct builtin_description bdesc_special_args[] =
26829 {
26830 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26831 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26832 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26834 /* MMX */
26835 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26837 /* 3DNow! */
26838 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26840 /* FXSR, XSAVE and XSAVEOPT */
26841 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26842 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26843 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26844 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26845 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26847 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26848 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26849 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26850 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26851 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26853 /* SSE */
26854 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26855 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26856 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26858 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26859 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26860 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26861 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26863 /* SSE or 3DNow!A */
26864 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26865 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26867 /* SSE2 */
26868 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26869 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26870 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26871 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26872 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26873 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26874 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26875 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26876 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26877 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26879 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26880 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26882 /* SSE3 */
26883 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26885 /* SSE4.1 */
26886 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26888 /* SSE4A */
26889 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26890 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26892 /* AVX */
26893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26894 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26896 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26897 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26898 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26900 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26902 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26903 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26904 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26907 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26908 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26910 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26911 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26912 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26914 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26915 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26916 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26917 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26918 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26919 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26920 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26921 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26923 /* AVX2 */
26924 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26925 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26926 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26927 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26928 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26929 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26930 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26931 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26932 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26934 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26935 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26936 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26937 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26938 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26939 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26941 /* FSGSBASE */
26942 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26943 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26944 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26945 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26946 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26947 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26948 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26949 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26951 /* RTM */
26952 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26953 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26954 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26955 };
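/* Usage sketch (added for illustration; the helper below is an assumption,
   not part of this file): the "special" builtins above read or write memory
   or machine state, so none of them are registered as const.  A typical one
   is the SSE non-temporal store behind _mm_stream_ps.  */
#if 0
#include <x86intrin.h>

static void
example_stream (float *dst, __m128 v)   /* hypothetical helper, compile with -msse */
{
  /* VOID_FTYPE_PFLOAT_V4SF in the table above: performed for its store
     side effect, bypassing the cache.  */
  __builtin_ia32_movntps (dst, (__v4sf) v);
}
#endif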
26957 /* Builtins with variable number of arguments. */
26958 static const struct builtin_description bdesc_args[] =
26959 {
26960 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26961 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26962 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26963 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26964 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26965 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26966 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26968 /* MMX */
26969 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26970 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26971 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26972 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26974 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26977 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26978 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26979 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26981 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26983 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26986 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26997 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27018 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27020 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27023 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27024 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27027 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27028 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27029 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27030 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27032 /* 3DNow! */
27033 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27034 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27035 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27036 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27038 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27039 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27040 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27041 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27042 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27043 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27044 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27045 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27046 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27047 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27048 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27049 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27050 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27051 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27052 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27054 /* 3DNow!A */
27055 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27056 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27057 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27058 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27059 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27060 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27062 /* SSE */
27063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27064 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27065 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27067 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27070 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27071 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27074 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27078 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27079 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27080 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27118 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27126 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27130 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27132 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27135 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27136 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27138 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27139 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27141 /* SSE MMX or 3DNow!A */
27142 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27143 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27144 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27146 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27147 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27148 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27149 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27151 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27152 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27154 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27156 /* SSE2 */
27157 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27159 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27160 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27161 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27162 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27163 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27166 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27167 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27168 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27169 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27171 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27173 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27175 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27176 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27185 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27206 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27208 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27212 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27214 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27268 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27274 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27276 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27278 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27293 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27306 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27308 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27310 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27314 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27322 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27326 /* SSE2 MMX */
27327 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27330 /* SSE3 */
27331 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27332 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27334 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27335 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27336 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27337 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27338 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27339 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27341 /* SSSE3 */
27342 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27343 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27344 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27345 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27346 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27347 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27349 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27350 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27351 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27352 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27353 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27354 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27355 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27356 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27357 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27358 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27359 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27360 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27361 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27362 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27363 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27364 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27365 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27366 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27367 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27368 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27369 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27370 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27371 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27372 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27374 /* SSSE3. */
27375 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27376 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27378 /* SSE4.1 */
27379 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27380 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27381 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27382 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27383 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27384 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27385 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27386 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27387 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27388 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27390 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27391 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27392 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27393 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27394 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27395 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27396 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27398 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27399 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27401 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27402 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27404 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27405 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27406 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27407 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27408 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27409 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27410 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27411 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27412 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27413 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27414 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27415 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27417 /* SSE4.1 */
27418 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27419 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27420 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27421 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
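/* In the floor/ceil/trunc/rint entries below, the comparison-code field is
   reused to carry the ROUND_FLOOR/ROUND_CEIL/ROUND_TRUNC/ROUND_MXCSR
   rounding immediate (cast to enum rtx_code), so a single
   sse4_1_roundpd/roundps pattern can serve several builtins.  */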
27423 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27424 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27425 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27426 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27428 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27429 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27431 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27432 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27434 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27435 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27436 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27437 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27439 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27440 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27442 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27443 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
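/* In the ptest entries (and the AVX vtest/ptest256 entries further down),
   the comparison code selects which flag the expander tests after the
   compare: EQ -> ZF (ptestz), LTU -> CF (ptestc), GTU -> neither flag set
   (ptestnzc).  */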
27445 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27446 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27447 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27449 /* SSE4.2 */
27450 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27451 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27452 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27453 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27454 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27456 /* SSE4A */
27457 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27458 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27459 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27460 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
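/* The AES and PCLMUL rows below carry a null name: those builtins are
   declared separately (together with their OPTION_MASK_ISA_AES /
   OPTION_MASK_ISA_PCLMUL requirement), and these entries exist only to
   describe how to expand them.  */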
27462 /* AES */
27463 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27464 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27466 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27467 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27468 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27469 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27471 /* PCLMUL */
27472 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27474 /* AVX */
27475 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27476 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27477 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27478 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27479 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27480 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27481 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27482 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27483 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27484 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27485 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27486 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27487 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27488 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27489 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27490 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27491 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27492 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27493 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27494 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27495 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27497 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27593 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27596 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27597 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27600 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27601 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27607 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27610 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27611 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27615 /* AVX2 */
27616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27617 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27618 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27619 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27624 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27625 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27626 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27627 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27633 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27636 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27756 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27757 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27763 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27765 /* BMI */
27766 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27767 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27768 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27770 /* TBM */
27771 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27772 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27774 /* F16C */
27775 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27776 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27777 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27778 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27780 /* BMI2 */
27781 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27782 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27783 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27784 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27785 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27786 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27789 /* FMA4 and XOP. */
27790 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27791 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27792 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27793 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27794 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27795 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27796 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27797 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27798 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27799 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27800 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27801 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27802 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27803 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27804 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27805 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27806 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27807 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27808 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27809 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27810 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27811 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27812 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27813 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27814 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27815 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27816 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27817 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27818 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27819 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27820 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27821 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27822 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27823 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27824 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27825 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27826 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27827 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27828 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27829 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27830 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27831 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27832 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27833 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27834 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27835 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27836 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27837 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27838 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27839 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27840 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27841 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27843 static const struct builtin_description bdesc_multi_arg[] =
27845 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27846 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27847 UNKNOWN, (int)MULTI_ARG_3_SF },
27848 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27849 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27850 UNKNOWN, (int)MULTI_ARG_3_DF },
27852 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27853 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27854 UNKNOWN, (int)MULTI_ARG_3_SF },
27855 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27856 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27857 UNKNOWN, (int)MULTI_ARG_3_DF },
27859 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27860 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27861 UNKNOWN, (int)MULTI_ARG_3_SF },
27862 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27863 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27864 UNKNOWN, (int)MULTI_ARG_3_DF },
27865 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27866 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27867 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27868 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27869 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27870 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27872 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27873 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27874 UNKNOWN, (int)MULTI_ARG_3_SF },
27875 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27876 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27877 UNKNOWN, (int)MULTI_ARG_3_DF },
27878 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27879 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27880 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27881 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27882 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27883 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27887 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27891 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28045 /* TM vector builtins. */
28047 /* Reuse the existing x86-specific `struct builtin_description' because
28048 we're lazy.  Add casts to make them fit.  */
28049 static const struct builtin_description bdesc_tm[] =
28051 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28052 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28053 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28054 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28055 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28056 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28057 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28059 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28060 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28061 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28062 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28063 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28064 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28065 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28067 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28068 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28069 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28070 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28071 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28072 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28073 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28075 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28076 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28077 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28080 /* TM callbacks. */
28082 /* Return the builtin decl needed to load a vector of TYPE. */
28084 static tree
28085 ix86_builtin_tm_load (tree type)
28087 if (TREE_CODE (type) == VECTOR_TYPE)
28089 switch (tree_low_cst (TYPE_SIZE (type), 1))
28091 case 64:
28092 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28093 case 128:
28094 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28095 case 256:
28096 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28099 return NULL_TREE;
28102 /* Return the builtin decl needed to store a vector of TYPE. */
28104 static tree
28105 ix86_builtin_tm_store (tree type)
28107 if (TREE_CODE (type) == VECTOR_TYPE)
28109 switch (tree_low_cst (TYPE_SIZE (type), 1))
28111 case 64:
28112 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28113 case 128:
28114 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28115 case 256:
28116 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28119 return NULL_TREE;
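/* Illustration (editor's sketch, not part of the original source): the two
   helpers above select the TM builtin purely by the vector's size in bits.
   For a 128-bit vector type T,

     tree decl = ix86_builtin_tm_load (T);

   returns the BUILT_IN_TM_LOAD_M128 decl, i.e. __builtin__ITM_RM128 from
   bdesc_tm above.  Non-vector types, and vectors whose size is not 64, 128
   or 256 bits, fall through the switch and yield NULL_TREE.  */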
28122 /* Initialize the transactional memory vector load/store builtins. */
28124 static void
28125 ix86_init_tm_builtins (void)
28127 enum ix86_builtin_func_type ftype;
28128 const struct builtin_description *d;
28129 size_t i;
28130 tree decl;
28131 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28132 tree attrs_log, attrs_type_log;
28134 if (!flag_tm)
28135 return;
28137 /* If there are no builtins defined, we must be compiling in a
28138 language without trans-mem support. */
28139 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28140 return;
28142 /* Use whatever attributes a normal TM load has. */
28143 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28144 attrs_load = DECL_ATTRIBUTES (decl);
28145 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28146 /* Use whatever attributes a normal TM store has. */
28147 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28148 attrs_store = DECL_ATTRIBUTES (decl);
28149 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28150 /* Use whatever attributes a normal TM log has. */
28151 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28152 attrs_log = DECL_ATTRIBUTES (decl);
28153 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28155 for (i = 0, d = bdesc_tm;
28156 i < ARRAY_SIZE (bdesc_tm);
28157 i++, d++)
28159 if ((d->mask & ix86_isa_flags) != 0
28160 || (lang_hooks.builtin_function
28161 == lang_hooks.builtin_function_ext_scope))
28163 tree type, attrs, attrs_type;
28164 enum built_in_function code = (enum built_in_function) d->code;
28166 ftype = (enum ix86_builtin_func_type) d->flag;
28167 type = ix86_get_builtin_func_type (ftype);
28169 if (BUILTIN_TM_LOAD_P (code))
28171 attrs = attrs_load;
28172 attrs_type = attrs_type_load;
28174 else if (BUILTIN_TM_STORE_P (code))
28176 attrs = attrs_store;
28177 attrs_type = attrs_type_store;
28179 else
28181 attrs = attrs_log;
28182 attrs_type = attrs_type_log;
28184 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28185 /* The builtin without the prefix for
28186 calling it directly. */
28187 d->name + strlen ("__builtin_"),
28188 attrs);
28189 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28190 set the TYPE_ATTRIBUTES. */
28191 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28193 set_builtin_decl (code, decl, false);
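/* Example (editor's note): for the bdesc_tm entry "__builtin__ITM_RM128",
   the add_builtin_function call above registers the builtin under that name
   and, through d->name + strlen ("__builtin_"), also makes it callable
   directly as "_ITM_RM128".  Because its code satisfies BUILTIN_TM_LOAD_P,
   its DECL_ATTRIBUTES and TYPE_ATTRIBUTES are copied from the scalar
   BUILT_IN_TM_LOAD_1 builtin fetched earlier.  */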
28198 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28199 not in the current target ISA, so that the user can compile particular
28200 modules with target-specific options that differ from the command-line
28201 options.  */
28202 static void
28203 ix86_init_mmx_sse_builtins (void)
28205 const struct builtin_description * d;
28206 enum ix86_builtin_func_type ftype;
28207 size_t i;
28209 /* Add all special builtins with a variable number of operands.  */
28210 for (i = 0, d = bdesc_special_args;
28211 i < ARRAY_SIZE (bdesc_special_args);
28212 i++, d++)
28214 if (d->name == 0)
28215 continue;
28217 ftype = (enum ix86_builtin_func_type) d->flag;
28218 def_builtin (d->mask, d->name, ftype, d->code);
28221 /* Add all builtins with a variable number of operands.  */
28222 for (i = 0, d = bdesc_args;
28223 i < ARRAY_SIZE (bdesc_args);
28224 i++, d++)
28226 if (d->name == 0)
28227 continue;
28229 ftype = (enum ix86_builtin_func_type) d->flag;
28230 def_builtin_const (d->mask, d->name, ftype, d->code);
28233 /* pcmpestr[im] insns. */
28234 for (i = 0, d = bdesc_pcmpestr;
28235 i < ARRAY_SIZE (bdesc_pcmpestr);
28236 i++, d++)
28238 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28239 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28240 else
28241 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28242 def_builtin_const (d->mask, d->name, ftype, d->code);
28245 /* pcmpistr[im] insns. */
28246 for (i = 0, d = bdesc_pcmpistr;
28247 i < ARRAY_SIZE (bdesc_pcmpistr);
28248 i++, d++)
28250 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28251 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28252 else
28253 ftype = INT_FTYPE_V16QI_V16QI_INT;
28254 def_builtin_const (d->mask, d->name, ftype, d->code);
28257 /* comi/ucomi insns. */
28258 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28260 if (d->mask == OPTION_MASK_ISA_SSE2)
28261 ftype = INT_FTYPE_V2DF_V2DF;
28262 else
28263 ftype = INT_FTYPE_V4SF_V4SF;
28264 def_builtin_const (d->mask, d->name, ftype, d->code);
28267 /* SSE */
28268 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28269 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28270 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28271 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28273 /* SSE or 3DNow!A */
28274 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28275 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28276 IX86_BUILTIN_MASKMOVQ);
28278 /* SSE2 */
28279 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28280 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28282 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28283 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28284 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28285 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28287 /* SSE3. */
28288 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28289 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28290 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28291 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28293 /* AES */
28294 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28295 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28296 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28297 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28298 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28299 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28300 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28301 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28302 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28303 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28304 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28305 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
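/* Editor's note (hedged): these AES builtins carry the same two-operand
   V2DI signatures as the corresponding wmmintrin.h intrinsics; for example,
   _mm_aesenc_si128 is expected to expand to __builtin_ia32_aesenc128 with
   the V2DI_FTYPE_V2DI_V2DI signature used above.  */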
28307 /* PCLMUL */
28308 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28309 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28311 /* RDRND */
28312 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28313 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28314 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28315 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28316 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28317 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28318 IX86_BUILTIN_RDRAND64_STEP);
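/* Usage sketch (editor's addition, illustrative only): each *_step builtin
   stores a random value through its pointer argument and returns the carry
   flag (matching INT_FTYPE_PUNSIGNED above), so callers typically retry
   until the hardware reports success:

     unsigned int r;
     while (!__builtin_ia32_rdrand32_step (&r))
       ;

   A zero return means no random value was available on that attempt.  */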
28320 /* AVX2 */
28321 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28322 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28323 IX86_BUILTIN_GATHERSIV2DF);
28325 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28326 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28327 IX86_BUILTIN_GATHERSIV4DF);
28329 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28330 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28331 IX86_BUILTIN_GATHERDIV2DF);
28333 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28334 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28335 IX86_BUILTIN_GATHERDIV4DF);
28337 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28338 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28339 IX86_BUILTIN_GATHERSIV4SF);
28341 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28342 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28343 IX86_BUILTIN_GATHERSIV8SF);
28345 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28346 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28347 IX86_BUILTIN_GATHERDIV4SF);
28349 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28350 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28351 IX86_BUILTIN_GATHERDIV8SF);
28353 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28354 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28355 IX86_BUILTIN_GATHERSIV2DI);
28357 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28358 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28359 IX86_BUILTIN_GATHERSIV4DI);
28361 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28362 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28363 IX86_BUILTIN_GATHERDIV2DI);
28365 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28366 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28367 IX86_BUILTIN_GATHERDIV4DI);
28369 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28370 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28371 IX86_BUILTIN_GATHERSIV4SI);
28373 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28374 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28375 IX86_BUILTIN_GATHERSIV8SI);
28377 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28378 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28379 IX86_BUILTIN_GATHERDIV4SI);
28381 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28382 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28383 IX86_BUILTIN_GATHERDIV8SI);
28385 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28386 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28387 IX86_BUILTIN_GATHERALTSIV4DF);
28389 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28390 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28391 IX86_BUILTIN_GATHERALTDIV8SF);
28393 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28394 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28395 IX86_BUILTIN_GATHERALTSIV4DI);
28397 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28398 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28399 IX86_BUILTIN_GATHERALTDIV8SI);
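/* Editor's note (hedged): these gather builtins back the avx2intrin.h gather
   intrinsics.  For instance, the V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT
   signature of __builtin_ia32_gathersiv2df corresponds to
   _mm_mask_i32gather_pd (src, base, index, mask, scale), where SRC and MASK
   are V2DF values, BASE points to doubles, INDEX is a V4SI vector of 32-bit
   offsets and SCALE is an immediate.  */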
28401 /* RTM. */
28402 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28403 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28405 /* MMX access to the vec_init patterns. */
28406 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28407 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28409 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28410 V4HI_FTYPE_HI_HI_HI_HI,
28411 IX86_BUILTIN_VEC_INIT_V4HI);
28413 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28414 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28415 IX86_BUILTIN_VEC_INIT_V8QI);
28417 /* Access to the vec_extract patterns. */
28418 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28419 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28420 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28421 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28422 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28423 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28424 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28425 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28426 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28427 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28429 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28430 "__builtin_ia32_vec_ext_v4hi",
28431 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28433 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28434 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28436 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28437 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28439 /* Access to the vec_set patterns. */
28440 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28441 "__builtin_ia32_vec_set_v2di",
28442 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28444 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28445 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28447 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28448 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28450 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28451 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28453 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28454 "__builtin_ia32_vec_set_v4hi",
28455 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28457 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28458 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28460 /* RDSEED */
28461 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28462 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28463 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28464 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28465 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28466 "__builtin_ia32_rdseed_di_step",
28467 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28469 /* ADCX */
28470 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28471 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28472 def_builtin (OPTION_MASK_ISA_64BIT,
28473 "__builtin_ia32_addcarryx_u64",
28474 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28475 IX86_BUILTIN_ADDCARRYX64);
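/* Usage sketch (editor's addition, illustrative only): the add-with-carry
   builtin matches UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED above -- it takes a
   carry-in, two operands and an output pointer, and returns the carry-out,
   so a multi-word addition of the 32-bit halves a0/a1 and b0/b1 chains it:

     unsigned int lo, hi;
     unsigned char c;
     c = __builtin_ia32_addcarryx_u32 (0, a0, b0, &lo);
     c = __builtin_ia32_addcarryx_u32 (c, a1, b1, &hi);
*/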
28477 /* Add the FMA4 and XOP multi-argument builtins.  */
28478 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28480 if (d->name == 0)
28481 continue;
28483 ftype = (enum ix86_builtin_func_type) d->flag;
28484 def_builtin_const (d->mask, d->name, ftype, d->code);
28488 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28489 to return a pointer to VERSION_DECL if the outcome of the expression
28490 formed by PREDICATE_CHAIN is true. This function will be called during
28491 version dispatch to decide which function version to execute. It returns
28492 the basic block at the end, to which more conditions can be added. */
28494 static basic_block
28495 add_condition_to_bb (tree function_decl, tree version_decl,
28496 tree predicate_chain, basic_block new_bb)
28498 gimple return_stmt;
28499 tree convert_expr, result_var;
28500 gimple convert_stmt;
28501 gimple call_cond_stmt;
28502 gimple if_else_stmt;
28504 basic_block bb1, bb2, bb3;
28505 edge e12, e23;
28507 tree cond_var, and_expr_var = NULL_TREE;
28508 gimple_seq gseq;
28510 tree predicate_decl, predicate_arg;
28512 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28514 gcc_assert (new_bb != NULL);
28515 gseq = bb_seq (new_bb);
28518 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28519 build_fold_addr_expr (version_decl));
28520 result_var = create_tmp_var (ptr_type_node, NULL);
28521 convert_stmt = gimple_build_assign (result_var, convert_expr);
28522 return_stmt = gimple_build_return (result_var);
28524 if (predicate_chain == NULL_TREE)
28526 gimple_seq_add_stmt (&gseq, convert_stmt);
28527 gimple_seq_add_stmt (&gseq, return_stmt);
28528 set_bb_seq (new_bb, gseq);
28529 gimple_set_bb (convert_stmt, new_bb);
28530 gimple_set_bb (return_stmt, new_bb);
28531 pop_cfun ();
28532 return new_bb;
28535 while (predicate_chain != NULL)
28537 cond_var = create_tmp_var (integer_type_node, NULL);
28538 predicate_decl = TREE_PURPOSE (predicate_chain);
28539 predicate_arg = TREE_VALUE (predicate_chain);
28540 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28541 gimple_call_set_lhs (call_cond_stmt, cond_var);
28543 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28544 gimple_set_bb (call_cond_stmt, new_bb);
28545 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28547 predicate_chain = TREE_CHAIN (predicate_chain);
28549 if (and_expr_var == NULL)
28550 and_expr_var = cond_var;
28551 else
28553 gimple assign_stmt;
28554 /* Use MIN_EXPR to check whether any integer is zero:
28555 and_expr_var = MIN_EXPR <cond_var, and_expr_var>.  */
28556 assign_stmt = gimple_build_assign (and_expr_var,
28557 build2 (MIN_EXPR, integer_type_node,
28558 cond_var, and_expr_var));
28560 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28561 gimple_set_bb (assign_stmt, new_bb);
28562 gimple_seq_add_stmt (&gseq, assign_stmt);
28566 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28567 integer_zero_node,
28568 NULL_TREE, NULL_TREE);
28569 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28570 gimple_set_bb (if_else_stmt, new_bb);
28571 gimple_seq_add_stmt (&gseq, if_else_stmt);
28573 gimple_seq_add_stmt (&gseq, convert_stmt);
28574 gimple_seq_add_stmt (&gseq, return_stmt);
28575 set_bb_seq (new_bb, gseq);
28577 bb1 = new_bb;
28578 e12 = split_block (bb1, if_else_stmt);
28579 bb2 = e12->dest;
28580 e12->flags &= ~EDGE_FALLTHRU;
28581 e12->flags |= EDGE_TRUE_VALUE;
28583 e23 = split_block (bb2, return_stmt);
28585 gimple_set_bb (convert_stmt, bb2);
28586 gimple_set_bb (return_stmt, bb2);
28588 bb3 = e23->dest;
28589 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28591 remove_edge (e23);
28592 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28594 pop_cfun ();
28596 return bb3;
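/* Editor's sketch of what one call to add_condition_to_bb appends to NEW_BB
   (names are illustrative, not taken from the source):

     cond_1 = __builtin_cpu_is ("core2");       predicate from PREDICATE_CHAIN
     and_1  = MIN_EXPR <cond_1, and_0>;         AND of all predicates so far
     if (and_1 > 0)
       return (void *) &foo_core2;              i.e. VERSION_DECL
     else
       fall through to the returned block, where the condition for the next
       version will be added.

   MIN_EXPR is used so the result is nonzero only if every predicate in the
   chain evaluated to a nonzero value.  */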
28599 /* This parses the arguments of the "target" attribute on DECL and determines
28600 the right builtin to use to match the platform specification.
28601 It returns the priority value for this version decl. If PREDICATE_LIST
28602 is not NULL, it stores the list of cpu features that need to be checked
28603 before dispatching this function. */
28605 static unsigned int
28606 get_builtin_code_for_version (tree decl, tree *predicate_list)
28608 tree attrs;
28609 struct cl_target_option cur_target;
28610 tree target_node;
28611 struct cl_target_option *new_target;
28612 const char *arg_str = NULL;
28613 const char *attrs_str = NULL;
28614 char *tok_str = NULL;
28615 char *token;
28617 /* Priority of i386 features, greater value is higher priority. This is
28618 used to decide the order in which function dispatch must happen. For
28619 instance, a version specialized for SSE4.2 should be checked for dispatch
28620 before a version for SSE3, as SSE4.2 implies SSE3. */
28621 enum feature_priority
28623 P_ZERO = 0,
28624 P_MMX,
28625 P_SSE,
28626 P_SSE2,
28627 P_SSE3,
28628 P_SSSE3,
28629 P_PROC_SSSE3,
28630 P_SSE4_a,
28631 P_PROC_SSE4_a,
28632 P_SSE4_1,
28633 P_SSE4_2,
28634 P_PROC_SSE4_2,
28635 P_POPCNT,
28636 P_AVX,
28637 P_AVX2,
28638 P_FMA,
28639 P_PROC_FMA
28642 enum feature_priority priority = P_ZERO;
28644 /* These are the target attribute strings for which a dispatcher is
28645 available, from fold_builtin_cpu. */
28647 static struct _feature_list
28649 const char *const name;
28650 const enum feature_priority priority;
28652 const feature_list[] =
28654 {"mmx", P_MMX},
28655 {"sse", P_SSE},
28656 {"sse2", P_SSE2},
28657 {"sse3", P_SSE3},
28658 {"ssse3", P_SSSE3},
28659 {"sse4.1", P_SSE4_1},
28660 {"sse4.2", P_SSE4_2},
28661 {"popcnt", P_POPCNT},
28662 {"avx", P_AVX},
28663 {"avx2", P_AVX2}
28667 static unsigned int NUM_FEATURES
28668 = sizeof (feature_list) / sizeof (struct _feature_list);
28670 unsigned int i;
28672 tree predicate_chain = NULL_TREE;
28673 tree predicate_decl, predicate_arg;
28675 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28676 gcc_assert (attrs != NULL);
28678 attrs = TREE_VALUE (TREE_VALUE (attrs));
28680 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28681 attrs_str = TREE_STRING_POINTER (attrs);
28684 /* Handle arch= if specified. For priority, set it to be 1 more than
28685 the best instruction set the processor can handle. For instance, if
28686 there is a version for atom and a version for ssse3 (the highest ISA
28687 priority for atom), the atom version must be checked for dispatch
28688 before the ssse3 version. */
28689 if (strstr (attrs_str, "arch=") != NULL)
28691 cl_target_option_save (&cur_target, &global_options);
28692 target_node = ix86_valid_target_attribute_tree (attrs);
28694 gcc_assert (target_node);
28695 new_target = TREE_TARGET_OPTION (target_node);
28696 gcc_assert (new_target);
28698 if (new_target->arch_specified && new_target->arch > 0)
28700 switch (new_target->arch)
28702 case PROCESSOR_CORE2:
28703 arg_str = "core2";
28704 priority = P_PROC_SSSE3;
28705 break;
28706 case PROCESSOR_COREI7:
28707 arg_str = "corei7";
28708 priority = P_PROC_SSE4_2;
28709 break;
28710 case PROCESSOR_ATOM:
28711 arg_str = "atom";
28712 priority = P_PROC_SSSE3;
28713 break;
28714 case PROCESSOR_AMDFAM10:
28715 arg_str = "amdfam10h";
28716 priority = P_PROC_SSE4_a;
28717 break;
28718 case PROCESSOR_BDVER1:
28719 arg_str = "bdver1";
28720 priority = P_PROC_FMA;
28721 break;
28722 case PROCESSOR_BDVER2:
28723 arg_str = "bdver2";
28724 priority = P_PROC_FMA;
28725 break;
28729 cl_target_option_restore (&global_options, &cur_target);
28731 if (predicate_list && arg_str == NULL)
28733 error_at (DECL_SOURCE_LOCATION (decl),
28734 "No dispatcher found for the versioning attributes");
28735 return 0;
28738 if (predicate_list)
28740 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28741 /* For a C string literal the length includes the trailing NUL. */
28742 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28743 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28744 predicate_chain);
28748 /* Process feature name. */
28749 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28750 strcpy (tok_str, attrs_str);
28751 token = strtok (tok_str, ",");
28752 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28754 while (token != NULL)
28756 /* Do not process "arch=" */
28757 if (strncmp (token, "arch=", 5) == 0)
28759 token = strtok (NULL, ",");
28760 continue;
28762 for (i = 0; i < NUM_FEATURES; ++i)
28764 if (strcmp (token, feature_list[i].name) == 0)
28766 if (predicate_list)
28768 predicate_arg = build_string_literal (
28769 strlen (feature_list[i].name) + 1,
28770 feature_list[i].name);
28771 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28772 predicate_chain);
28774 /* Find the maximum priority feature. */
28775 if (feature_list[i].priority > priority)
28776 priority = feature_list[i].priority;
28778 break;
28781 if (predicate_list && i == NUM_FEATURES)
28783 error_at (DECL_SOURCE_LOCATION (decl),
28784 "No dispatcher found for %s", token);
28785 return 0;
28787 token = strtok (NULL, ",");
28789 free (tok_str);
28791 if (predicate_list && predicate_chain == NULL_TREE)
28793 error_at (DECL_SOURCE_LOCATION (decl),
28794 "No dispatcher found for the versioning attributes : %s",
28795 attrs_str);
28796 return 0;
28798 else if (predicate_list)
28800 predicate_chain = nreverse (predicate_chain);
28801 *predicate_list = predicate_chain;
28804 return priority;
28807 /* This compares the priority of target features in function DECL1
28808 and DECL2. It returns positive value if DECL1 is higher priority,
28809 negative value if DECL2 is higher priority and 0 if they are the
28810 same. */
28812 static int
28813 ix86_compare_version_priority (tree decl1, tree decl2)
28815 unsigned int priority1 = 0;
28816 unsigned int priority2 = 0;
28818 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl1)) != NULL)
28819 priority1 = get_builtin_code_for_version (decl1, NULL);
28821 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl2)) != NULL)
28822 priority2 = get_builtin_code_for_version (decl2, NULL);
28824 return (int)priority1 - (int)priority2;
28827 /* V1 and V2 point to function versions with different priorities
28828 based on the target ISA. This function compares their priorities. */
28830 static int
28831 feature_compare (const void *v1, const void *v2)
28833 typedef struct _function_version_info
28835 tree version_decl;
28836 tree predicate_chain;
28837 unsigned int dispatch_priority;
28838 } function_version_info;
28840 const function_version_info c1 = *(const function_version_info *)v1;
28841 const function_version_info c2 = *(const function_version_info *)v2;
28842 return (c2.dispatch_priority - c1.dispatch_priority);
28845 /* This function generates the dispatch function for
28846 multi-versioned functions. DISPATCH_DECL is the function which will
28847 contain the dispatch logic. FNDECLS are the function choices for
28848 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28849 in DISPATCH_DECL in which the dispatch code is generated. */
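/* A rough sketch of the behaviour of the code generated here, assuming
   versions foo.avx2 and foo.arch_corei7 plus a default foo (the names are
   illustrative only):

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       return &foo.avx2;
     if (__builtin_cpu_is ("corei7"))
       return &foo.arch_corei7;
     return &foo;

   Each condition comes from a version's predicate chain, and the versions
   are tried in decreasing dispatch priority before falling back to the
   default.  */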
28851 static int
28852 dispatch_function_versions (tree dispatch_decl,
28853 void *fndecls_p,
28854 basic_block *empty_bb)
28856 tree default_decl;
28857 gimple ifunc_cpu_init_stmt;
28858 gimple_seq gseq;
28859 int ix;
28860 tree ele;
28861 vec<tree> *fndecls;
28862 unsigned int num_versions = 0;
28863 unsigned int actual_versions = 0;
28864 unsigned int i;
28866 struct _function_version_info
28868 tree version_decl;
28869 tree predicate_chain;
28870 unsigned int dispatch_priority;
28871 }*function_version_info;
28873 gcc_assert (dispatch_decl != NULL
28874 && fndecls_p != NULL
28875 && empty_bb != NULL);
28877 /* fndecls_p is actually a vector. */
28878 fndecls = static_cast<vec<tree> *> (fndecls_p);
28880 /* At least one more version other than the default. */
28881 num_versions = fndecls->length ();
28882 gcc_assert (num_versions >= 2);
28884 function_version_info = (struct _function_version_info *)
28885 XNEWVEC (struct _function_version_info, (num_versions - 1));
28887 /* The first version in the vector is the default decl. */
28888 default_decl = (*fndecls)[0];
28890 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28892 gseq = bb_seq (*empty_bb);
28893 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28894 constructors, so explicitly call __builtin_cpu_init here. */
28895 ifunc_cpu_init_stmt = gimple_build_call_vec (
28896 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28897 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28898 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28899 set_bb_seq (*empty_bb, gseq);
28901 pop_cfun ();
28904 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28906 tree version_decl = ele;
28907 tree predicate_chain = NULL_TREE;
28908 unsigned int priority;
28909 /* Get attribute string, parse it and find the right predicate decl.
28910 The predicate function could be a lengthy combination of many
28911 features, like arch-type and various isa-variants. */
28912 priority = get_builtin_code_for_version (version_decl,
28913 &predicate_chain);
28915 if (predicate_chain == NULL_TREE)
28916 continue;
28918 actual_versions++;
28919 function_version_info [ix - 1].version_decl = version_decl;
28920 function_version_info [ix - 1].predicate_chain = predicate_chain;
28921 function_version_info [ix - 1].dispatch_priority = priority;
28924 /* Sort the versions according to descending order of dispatch priority. The
28925 priority is based on the ISA. This is not a perfect solution. There
28926 could still be ambiguity. If more than one function version is suitable
28927 to execute, which one should be dispatched? In future, allow the user
28928 to specify a dispatch priority next to the version. */
28929 qsort (function_version_info, actual_versions,
28930 sizeof (struct _function_version_info), feature_compare);
28932 for (i = 0; i < actual_versions; ++i)
28933 *empty_bb = add_condition_to_bb (dispatch_decl,
28934 function_version_info[i].version_decl,
28935 function_version_info[i].predicate_chain,
28936 *empty_bb);
28938 /* Dispatch the default version at the end. */
28939 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28940 NULL, *empty_bb);
28942 free (function_version_info);
28943 return 0;
28946 /* Comparator function to be used in qsort routine to sort attribute
28947 specification strings to "target". */
28949 static int
28950 attr_strcmp (const void *v1, const void *v2)
28952 const char *c1 = *(char *const*)v1;
28953 const char *c2 = *(char *const*)v2;
28954 return strcmp (c1, c2);
28957 /* STR is the argument to target attribute. This function tokenizes
28958 the comma separated arguments, sorts them and returns a string which
28959 is a unique identifier for the comma separated arguments. It also
28960 replaces non-identifier characters "=,-" with "_". */
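/* For example (illustrative only): the attribute string "avx,arch=corei7"
   has '=' rewritten to '_', is split into the tokens "avx" and
   "arch_corei7", sorted, and rejoined with '_', giving "arch_corei7_avx".  */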
28962 static char *
28963 sorted_attr_string (const char *str)
28965 char **args = NULL;
28966 char *attr_str, *ret_str;
28967 char *attr = NULL;
28968 unsigned int argnum = 1;
28969 unsigned int i;
28971 for (i = 0; i < strlen (str); i++)
28972 if (str[i] == ',')
28973 argnum++;
28975 attr_str = (char *)xmalloc (strlen (str) + 1);
28976 strcpy (attr_str, str);
28978 /* Replace "=,-" with "_". */
28979 for (i = 0; i < strlen (attr_str); i++)
28980 if (attr_str[i] == '=' || attr_str[i]== '-')
28981 attr_str[i] = '_';
28983 if (argnum == 1)
28984 return attr_str;
28986 args = XNEWVEC (char *, argnum);
28988 i = 0;
28989 attr = strtok (attr_str, ",");
28990 while (attr != NULL)
28992 args[i] = attr;
28993 i++;
28994 attr = strtok (NULL, ",");
28997 qsort (args, argnum, sizeof (char*), attr_strcmp);
28999 ret_str = (char *)xmalloc (strlen (str) + 1);
29000 strcpy (ret_str, args[0]);
29001 for (i = 1; i < argnum; i++)
29003 strcat (ret_str, "_");
29004 strcat (ret_str, args[i]);
29007 free (args);
29008 free (attr_str);
29009 return ret_str;
29012 /* This function changes the assembler name for functions that are
29013 versions. If DECL is a function version and has a "target"
29014 attribute, it appends the attribute string to its assembler name. */
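/* For example (illustrative only): a version of foo declared with
   __attribute__ ((target ("arch=corei7,avx"))) gets the assembler name
   "foo.arch_corei7_avx", built from the sorted attribute string produced
   by sorted_attr_string above.  */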
29016 static tree
29017 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29019 tree version_attr;
29020 const char *orig_name, *version_string, *attr_str;
29021 char *assembler_name;
29023 if (DECL_DECLARED_INLINE_P (decl)
29024 && lookup_attribute ("gnu_inline",
29025 DECL_ATTRIBUTES (decl)))
29026 error_at (DECL_SOURCE_LOCATION (decl),
29027 "Function versions cannot be marked as gnu_inline,"
29028 " bodies have to be generated");
29030 if (DECL_VIRTUAL_P (decl)
29031 || DECL_VINDEX (decl))
29032 error_at (DECL_SOURCE_LOCATION (decl),
29033 "Virtual function versioning not supported\n");
29035 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29037 /* target attribute string is NULL for default functions. */
29038 if (version_attr == NULL_TREE)
29039 return id;
29041 orig_name = IDENTIFIER_POINTER (id);
29042 version_string
29043 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29045 attr_str = sorted_attr_string (version_string);
29046 assembler_name = (char *) xmalloc (strlen (orig_name)
29047 + strlen (attr_str) + 2);
29049 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29051 /* Allow assembler name to be modified if already set. */
29052 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29053 SET_DECL_RTL (decl, NULL);
29055 return get_identifier (assembler_name);
29058 /* This function returns true if FN1 and FN2 are versions of the same function,
29059 that is, the target strings of the function decls are different. This assumes
29060 that FN1 and FN2 have the same signature. */
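/* For example (illustrative only): two declarations of foo, one with
   __attribute__ ((target ("avx"))) and one with
   __attribute__ ((target ("sse4.2"))), are versions of each other; two
   declarations that both carry target ("avx") are not, since their sorted
   target strings compare equal.  */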
29062 static bool
29063 ix86_function_versions (tree fn1, tree fn2)
29065 tree attr1, attr2;
29066 const char *attr_str1, *attr_str2;
29067 char *target1, *target2;
29068 bool result;
29070 if (TREE_CODE (fn1) != FUNCTION_DECL
29071 || TREE_CODE (fn2) != FUNCTION_DECL)
29072 return false;
29074 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29075 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29077 /* At least one function decl should have the target attribute specified. */
29078 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29079 return false;
29081 /* If one function does not have a target attribute, these are versions. */
29082 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29083 return true;
29085 attr_str1 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr1)));
29086 attr_str2 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr2)));
29088 target1 = sorted_attr_string (attr_str1);
29089 target2 = sorted_attr_string (attr_str2);
29091 /* The sorted target strings must be different for fn1 and fn2
29092 to be versions. */
29093 if (strcmp (target1, target2) == 0)
29094 result = false;
29095 else
29096 result = true;
29098 free (target1);
29099 free (target2);
29101 return result;
29104 /* This target supports function multiversioning. */
29106 static bool
29107 ix86_supports_function_versions (void)
29109 return true;
29112 static tree
29113 ix86_mangle_decl_assembler_name (tree decl, tree id)
29115 /* For function version, add the target suffix to the assembler name. */
29116 if (TREE_CODE (decl) == FUNCTION_DECL
29117 && DECL_FUNCTION_VERSIONED (decl))
29118 id = ix86_mangle_function_version_assembler_name (decl, id);
29119 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29120 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29121 #endif
29123 return id;
29126 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29127 is true, append the full path name of the source file. */
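/* For example (illustrative only): for a decl whose assembler name is
   "foo", make_name (decl, "resolver", false) yields "foo.resolver"; with
   MAKE_UNIQUE true, a file-based unique string is inserted as well,
   yielding something like "foo.<unique-file-id>.resolver".  */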
29129 static char *
29130 make_name (tree decl, const char *suffix, bool make_unique)
29132 char *global_var_name;
29133 int name_len;
29134 const char *name;
29135 const char *unique_name = NULL;
29137 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29139 /* Get a unique name that can be used globally without any chances
29140 of collision at link time. */
29141 if (make_unique)
29142 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29144 name_len = strlen (name) + strlen (suffix) + 2;
29146 if (make_unique)
29147 name_len += strlen (unique_name) + 1;
29148 global_var_name = XNEWVEC (char, name_len);
29150 /* Use '.' to concatenate names as it is demangler friendly. */
29151 if (make_unique)
29152 snprintf (global_var_name, name_len, "%s.%s.%s", name,
29153 unique_name, suffix);
29154 else
29155 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29157 return global_var_name;
29160 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29162 /* Make a dispatcher declaration for the multi-versioned function DECL.
29163 Calls to DECL function will be replaced with calls to the dispatcher
29164 by the front-end. Return the decl created. */
29166 static tree
29167 make_dispatcher_decl (const tree decl)
29169 tree func_decl;
29170 char *func_name, *resolver_name;
29171 tree fn_type, func_type;
29172 bool is_uniq = false;
29174 if (TREE_PUBLIC (decl) == 0)
29175 is_uniq = true;
29177 func_name = make_name (decl, "ifunc", is_uniq);
29178 resolver_name = make_name (decl, "resolver", is_uniq);
29179 gcc_assert (resolver_name);
29181 fn_type = TREE_TYPE (decl);
29182 func_type = build_function_type (TREE_TYPE (fn_type),
29183 TYPE_ARG_TYPES (fn_type));
29185 func_decl = build_fn_decl (func_name, func_type);
29186 TREE_USED (func_decl) = 1;
29187 DECL_CONTEXT (func_decl) = NULL_TREE;
29188 DECL_INITIAL (func_decl) = error_mark_node;
29189 DECL_ARTIFICIAL (func_decl) = 1;
29190 /* Mark this func as external, the resolver will flip it again if
29191 it gets generated. */
29192 DECL_EXTERNAL (func_decl) = 1;
29193 /* This will be an IFUNC; IFUNCs have to be externally visible. */
29194 TREE_PUBLIC (func_decl) = 1;
29196 return func_decl;
29199 #endif
29201 /* Returns true if decl is multi-versioned and DECL is the default function,
29202 that is, it is not tagged with a target-specific optimization. */
29204 static bool
29205 is_function_default_version (const tree decl)
29207 return (TREE_CODE (decl) == FUNCTION_DECL
29208 && DECL_FUNCTION_VERSIONED (decl)
29209 && lookup_attribute ("target", DECL_ATTRIBUTES (decl)) == NULL_TREE);
29212 /* Make a dispatcher declaration for the multi-versioned function DECL.
29213 Calls to DECL function will be replaced with calls to the dispatcher
29214 by the front-end. Returns the decl of the dispatcher function. */
29216 static tree
29217 ix86_get_function_versions_dispatcher (void *decl)
29219 tree fn = (tree) decl;
29220 struct cgraph_node *node = NULL;
29221 struct cgraph_node *default_node = NULL;
29222 struct cgraph_function_version_info *node_v = NULL;
29223 struct cgraph_function_version_info *first_v = NULL;
29225 tree dispatch_decl = NULL;
29227 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29228 struct cgraph_function_version_info *it_v = NULL;
29229 struct cgraph_node *dispatcher_node = NULL;
29230 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29231 #endif
29233 struct cgraph_function_version_info *default_version_info = NULL;
29235 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29237 node = cgraph_get_node (fn);
29238 gcc_assert (node != NULL);
29240 node_v = get_cgraph_node_version (node);
29241 gcc_assert (node_v != NULL);
29243 if (node_v->dispatcher_resolver != NULL)
29244 return node_v->dispatcher_resolver;
29246 /* Find the default version and make it the first node. */
29247 first_v = node_v;
29248 /* Go to the beginning of the chain. */
29249 while (first_v->prev != NULL)
29250 first_v = first_v->prev;
29251 default_version_info = first_v;
29252 while (default_version_info != NULL)
29254 if (is_function_default_version
29255 (default_version_info->this_node->symbol.decl))
29256 break;
29257 default_version_info = default_version_info->next;
29260 /* If there is no default node, just return NULL. */
29261 if (default_version_info == NULL)
29262 return NULL;
29264 /* Make default info the first node. */
29265 if (first_v != default_version_info)
29267 default_version_info->prev->next = default_version_info->next;
29268 if (default_version_info->next)
29269 default_version_info->next->prev = default_version_info->prev;
29270 first_v->prev = default_version_info;
29271 default_version_info->next = first_v;
29272 default_version_info->prev = NULL;
29275 default_node = default_version_info->this_node;
29277 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29278 /* Right now, the dispatching is done via ifunc. */
29279 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29281 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29282 gcc_assert (dispatcher_node != NULL);
29283 dispatcher_node->dispatcher_function = 1;
29284 dispatcher_version_info
29285 = insert_new_cgraph_node_version (dispatcher_node);
29286 dispatcher_version_info->next = default_version_info;
29287 dispatcher_node->local.finalized = 1;
29289 /* Set the dispatcher for all the versions. */
29290 it_v = default_version_info;
29291 while (it_v != NULL)
29293 it_v->dispatcher_resolver = dispatch_decl;
29294 it_v = it_v->next;
29296 #else
29297 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29298 "multiversioning needs ifunc which is not supported "
29299 "in this configuration");
29300 #endif
29301 return dispatch_decl;
29304 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29305 it to CHAIN. */
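/* For example (illustrative only): make_attribute ("ifunc", "foo.resolver",
   NULL_TREE) builds the tree equivalent of
   __attribute__ ((ifunc ("foo.resolver"))), which is how the dispatcher
   decl is tied to its resolver below.  */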
29307 static tree
29308 make_attribute (const char *name, const char *arg_name, tree chain)
29310 tree attr_name;
29311 tree attr_arg_name;
29312 tree attr_args;
29313 tree attr;
29315 attr_name = get_identifier (name);
29316 attr_arg_name = build_string (strlen (arg_name), arg_name);
29317 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29318 attr = tree_cons (attr_name, attr_args, chain);
29319 return attr;
29322 /* Make the resolver function decl to dispatch the versions of
29323 a multi-versioned function, DEFAULT_DECL. Create an
29324 empty basic block in the resolver and store the pointer in
29325 EMPTY_BB. Return the decl of the resolver function. */
29327 static tree
29328 make_resolver_func (const tree default_decl,
29329 const tree dispatch_decl,
29330 basic_block *empty_bb)
29332 char *resolver_name;
29333 tree decl, type, decl_name, t;
29334 bool is_uniq = false;
29336 /* IFUNC's have to be globally visible. So, if the default_decl is
29337 not, then the name of the IFUNC should be made unique. */
29338 if (TREE_PUBLIC (default_decl) == 0)
29339 is_uniq = true;
29341 /* Append the filename to the resolver function if the versions are
29342 not externally visible. This is because the resolver function has
29343 to be externally visible for the loader to find it. So, appending
29344 the filename will prevent conflicts with a resolver function from
29345 another module which is based on the same version name. */
29346 resolver_name = make_name (default_decl, "resolver", is_uniq);
29348 /* The resolver function should return a (void *). */
29349 type = build_function_type_list (ptr_type_node, NULL_TREE);
29351 decl = build_fn_decl (resolver_name, type);
29352 decl_name = get_identifier (resolver_name);
29353 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29355 DECL_NAME (decl) = decl_name;
29356 TREE_USED (decl) = 1;
29357 DECL_ARTIFICIAL (decl) = 1;
29358 DECL_IGNORED_P (decl) = 0;
29359 /* IFUNC resolvers have to be externally visible. */
29360 TREE_PUBLIC (decl) = 1;
29361 DECL_UNINLINABLE (decl) = 0;
29363 /* Resolver is not external, body is generated. */
29364 DECL_EXTERNAL (decl) = 0;
29365 DECL_EXTERNAL (dispatch_decl) = 0;
29367 DECL_CONTEXT (decl) = NULL_TREE;
29368 DECL_INITIAL (decl) = make_node (BLOCK);
29369 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29371 if (DECL_COMDAT_GROUP (default_decl)
29372 || TREE_PUBLIC (default_decl))
29374 /* In this case, each translation unit with a call to this
29375 versioned function will put out a resolver. Ensure it
29376 is comdat to keep just one copy. */
29377 DECL_COMDAT (decl) = 1;
29378 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29380 /* Build result decl and add to function_decl. */
29381 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29382 DECL_ARTIFICIAL (t) = 1;
29383 DECL_IGNORED_P (t) = 1;
29384 DECL_RESULT (decl) = t;
29386 gimplify_function_tree (decl);
29387 push_cfun (DECL_STRUCT_FUNCTION (decl));
29388 *empty_bb = init_lowered_empty_function (decl, false);
29390 cgraph_add_new_function (decl, true);
29391 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29393 pop_cfun ();
29395 gcc_assert (dispatch_decl != NULL);
29396 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29397 DECL_ATTRIBUTES (dispatch_decl)
29398 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29400 /* Create the alias for dispatch to resolver here. */
29401 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29402 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29403 return decl;
29406 /* Generate the dispatching code body to dispatch multi-versioned function
29407 DECL. The target hook is called to process the "target" attributes and
29408 provide the code to dispatch the right function at run-time. NODE points
29409 to the dispatcher decl whose body will be created. */
29411 static tree
29412 ix86_generate_version_dispatcher_body (void *node_p)
29414 tree resolver_decl;
29415 basic_block empty_bb;
29416 vec<tree> fn_ver_vec = vNULL;
29417 tree default_ver_decl;
29418 struct cgraph_node *versn;
29419 struct cgraph_node *node;
29421 struct cgraph_function_version_info *node_version_info = NULL;
29422 struct cgraph_function_version_info *versn_info = NULL;
29424 node = (cgraph_node *)node_p;
29426 node_version_info = get_cgraph_node_version (node);
29427 gcc_assert (node->dispatcher_function
29428 && node_version_info != NULL);
29430 if (node_version_info->dispatcher_resolver)
29431 return node_version_info->dispatcher_resolver;
29433 /* The first version in the chain corresponds to the default version. */
29434 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29436 /* node is going to be an alias, so remove the finalized bit. */
29437 node->local.finalized = false;
29439 resolver_decl = make_resolver_func (default_ver_decl,
29440 node->symbol.decl, &empty_bb);
29442 node_version_info->dispatcher_resolver = resolver_decl;
29444 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29446 fn_ver_vec.create (2);
29448 for (versn_info = node_version_info->next; versn_info;
29449 versn_info = versn_info->next)
29451 versn = versn_info->this_node;
29452 /* Check for virtual functions here again, as by this time it should
29453 have been determined if this function needs a vtable index or
29454 not. This happens for methods in derived classes that override
29455 virtual methods in base classes but are not explicitly marked as
29456 virtual. */
29457 if (DECL_VINDEX (versn->symbol.decl))
29458 error_at (DECL_SOURCE_LOCATION (versn->symbol.decl),
29459 "Virtual function multiversioning not supported");
29460 fn_ver_vec.safe_push (versn->symbol.decl);
29463 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29465 rebuild_cgraph_edges ();
29466 pop_cfun ();
29467 return resolver_decl;
29469 /* This builds the processor_model struct type defined in
29470 libgcc/config/i386/cpuinfo.c */
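/* For reference, the layout built here mirrors (approximately) the
   declaration in libgcc/config/i386/cpuinfo.c; see that file for the
   authoritative definition:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/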
29472 static tree
29473 build_processor_model_struct (void)
29475 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29476 "__cpu_features"};
29477 tree field = NULL_TREE, field_chain = NULL_TREE;
29478 int i;
29479 tree type = make_node (RECORD_TYPE);
29481 /* The first 3 fields are unsigned int. */
29482 for (i = 0; i < 3; ++i)
29484 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29485 get_identifier (field_name[i]), unsigned_type_node);
29486 if (field_chain != NULL_TREE)
29487 DECL_CHAIN (field) = field_chain;
29488 field_chain = field;
29491 /* The last field is an array of unsigned integers of size one. */
29492 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29493 get_identifier (field_name[3]),
29494 build_array_type (unsigned_type_node,
29495 build_index_type (size_one_node)));
29496 if (field_chain != NULL_TREE)
29497 DECL_CHAIN (field) = field_chain;
29498 field_chain = field;
29500 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29501 return type;
29504 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29506 static tree
29507 make_var_decl (tree type, const char *name)
29509 tree new_decl;
29511 new_decl = build_decl (UNKNOWN_LOCATION,
29512 VAR_DECL,
29513 get_identifier(name),
29514 type);
29516 DECL_EXTERNAL (new_decl) = 1;
29517 TREE_STATIC (new_decl) = 1;
29518 TREE_PUBLIC (new_decl) = 1;
29519 DECL_INITIAL (new_decl) = 0;
29520 DECL_ARTIFICIAL (new_decl) = 0;
29521 DECL_PRESERVE_P (new_decl) = 1;
29523 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29524 assemble_variable (new_decl, 0, 0, 0);
29526 return new_decl;
29529 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29530 into an integer defined in libgcc/config/i386/cpuinfo.c */
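/* A sketch of the folding done below (illustrative only):

     __builtin_cpu_is ("corei7")
       -> __cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START)
     __builtin_cpu_supports ("avx")
       -> __cpu_model.__cpu_features[0] & (1 << F_AVX)

   with the result converted to an integer-typed expression.  */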
29532 static tree
29533 fold_builtin_cpu (tree fndecl, tree *args)
29535 unsigned int i;
29536 enum ix86_builtins fn_code = (enum ix86_builtins)
29537 DECL_FUNCTION_CODE (fndecl);
29538 tree param_string_cst = NULL;
29540 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29541 enum processor_features
29543 F_CMOV = 0,
29544 F_MMX,
29545 F_POPCNT,
29546 F_SSE,
29547 F_SSE2,
29548 F_SSE3,
29549 F_SSSE3,
29550 F_SSE4_1,
29551 F_SSE4_2,
29552 F_AVX,
29553 F_AVX2,
29554 F_MAX
29557 /* These are the values for vendor types and cpu types and subtypes
29558 in cpuinfo.c. CPU type and subtype values must have the
29559 corresponding start value subtracted before they are compared. */
29560 enum processor_model
29562 M_INTEL = 1,
29563 M_AMD,
29564 M_CPU_TYPE_START,
29565 M_INTEL_ATOM,
29566 M_INTEL_CORE2,
29567 M_INTEL_COREI7,
29568 M_AMDFAM10H,
29569 M_AMDFAM15H,
29570 M_CPU_SUBTYPE_START,
29571 M_INTEL_COREI7_NEHALEM,
29572 M_INTEL_COREI7_WESTMERE,
29573 M_INTEL_COREI7_SANDYBRIDGE,
29574 M_AMDFAM10H_BARCELONA,
29575 M_AMDFAM10H_SHANGHAI,
29576 M_AMDFAM10H_ISTANBUL,
29577 M_AMDFAM15H_BDVER1,
29578 M_AMDFAM15H_BDVER2,
29579 M_AMDFAM15H_BDVER3
29582 static struct _arch_names_table
29584 const char *const name;
29585 const enum processor_model model;
29587 const arch_names_table[] =
29589 {"amd", M_AMD},
29590 {"intel", M_INTEL},
29591 {"atom", M_INTEL_ATOM},
29592 {"core2", M_INTEL_CORE2},
29593 {"corei7", M_INTEL_COREI7},
29594 {"nehalem", M_INTEL_COREI7_NEHALEM},
29595 {"westmere", M_INTEL_COREI7_WESTMERE},
29596 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29597 {"amdfam10h", M_AMDFAM10H},
29598 {"barcelona", M_AMDFAM10H_BARCELONA},
29599 {"shanghai", M_AMDFAM10H_SHANGHAI},
29600 {"istanbul", M_AMDFAM10H_ISTANBUL},
29601 {"amdfam15h", M_AMDFAM15H},
29602 {"bdver1", M_AMDFAM15H_BDVER1},
29603 {"bdver2", M_AMDFAM15H_BDVER2},
29604 {"bdver3", M_AMDFAM15H_BDVER3},
29607 static struct _isa_names_table
29609 const char *const name;
29610 const enum processor_features feature;
29612 const isa_names_table[] =
29614 {"cmov", F_CMOV},
29615 {"mmx", F_MMX},
29616 {"popcnt", F_POPCNT},
29617 {"sse", F_SSE},
29618 {"sse2", F_SSE2},
29619 {"sse3", F_SSE3},
29620 {"ssse3", F_SSSE3},
29621 {"sse4.1", F_SSE4_1},
29622 {"sse4.2", F_SSE4_2},
29623 {"avx", F_AVX},
29624 {"avx2", F_AVX2}
29627 tree __processor_model_type = build_processor_model_struct ();
29628 tree __cpu_model_var = make_var_decl (__processor_model_type,
29629 "__cpu_model");
29631 gcc_assert ((args != NULL) && (*args != NULL));
29633 param_string_cst = *args;
29634 while (param_string_cst
29635 && TREE_CODE (param_string_cst) != STRING_CST)
29637 /* *args must be an expr that can contain other EXPRs leading to a
29638 STRING_CST. */
29639 if (!EXPR_P (param_string_cst))
29641 error ("Parameter to builtin must be a string constant or literal");
29642 return integer_zero_node;
29644 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29647 gcc_assert (param_string_cst);
29649 if (fn_code == IX86_BUILTIN_CPU_IS)
29651 tree ref;
29652 tree field;
29653 tree final;
29655 unsigned int field_val = 0;
29656 unsigned int NUM_ARCH_NAMES
29657 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29659 for (i = 0; i < NUM_ARCH_NAMES; i++)
29660 if (strcmp (arch_names_table[i].name,
29661 TREE_STRING_POINTER (param_string_cst)) == 0)
29662 break;
29664 if (i == NUM_ARCH_NAMES)
29666 error ("Parameter to builtin not valid: %s",
29667 TREE_STRING_POINTER (param_string_cst));
29668 return integer_zero_node;
29671 field = TYPE_FIELDS (__processor_model_type);
29672 field_val = arch_names_table[i].model;
29674 /* CPU types are stored in the next field. */
29675 if (field_val > M_CPU_TYPE_START
29676 && field_val < M_CPU_SUBTYPE_START)
29678 field = DECL_CHAIN (field);
29679 field_val -= M_CPU_TYPE_START;
29682 /* CPU subtypes are stored in the next field. */
29683 if (field_val > M_CPU_SUBTYPE_START)
29685 field = DECL_CHAIN (DECL_CHAIN (field));
29686 field_val -= M_CPU_SUBTYPE_START;
29689 /* Get the appropriate field in __cpu_model. */
29690 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29691 field, NULL_TREE);
29693 /* Check the value. */
29694 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29695 build_int_cstu (unsigned_type_node, field_val));
29696 return build1 (CONVERT_EXPR, integer_type_node, final);
29698 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29700 tree ref;
29701 tree array_elt;
29702 tree field;
29703 tree final;
29705 unsigned int field_val = 0;
29706 unsigned int NUM_ISA_NAMES
29707 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29709 for (i = 0; i < NUM_ISA_NAMES; i++)
29710 if (strcmp (isa_names_table[i].name,
29711 TREE_STRING_POINTER (param_string_cst)) == 0)
29712 break;
29714 if (i == NUM_ISA_NAMES)
29716 error ("Parameter to builtin not valid: %s",
29717 TREE_STRING_POINTER (param_string_cst));
29718 return integer_zero_node;
29721 field = TYPE_FIELDS (__processor_model_type);
29722 /* Get the last field, which is __cpu_features. */
29723 while (DECL_CHAIN (field))
29724 field = DECL_CHAIN (field);
29726 /* Get the appropriate field: __cpu_model.__cpu_features */
29727 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29728 field, NULL_TREE);
29730 /* Access the 0th element of __cpu_features array. */
29731 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29732 integer_zero_node, NULL_TREE, NULL_TREE);
29734 field_val = (1 << isa_names_table[i].feature);
29735 /* Return __cpu_model.__cpu_features[0] & field_val */
29736 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29737 build_int_cstu (unsigned_type_node, field_val));
29738 return build1 (CONVERT_EXPR, integer_type_node, final);
29740 gcc_unreachable ();
29743 static tree
29744 ix86_fold_builtin (tree fndecl, int n_args,
29745 tree *args, bool ignore ATTRIBUTE_UNUSED)
29747 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29749 enum ix86_builtins fn_code = (enum ix86_builtins)
29750 DECL_FUNCTION_CODE (fndecl);
29751 if (fn_code == IX86_BUILTIN_CPU_IS
29752 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29754 gcc_assert (n_args == 1);
29755 return fold_builtin_cpu (fndecl, args);
29759 #ifdef SUBTARGET_FOLD_BUILTIN
29760 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29761 #endif
29763 return NULL_TREE;
29766 /* Make builtins to detect cpu type and features supported. NAME is
29767 the builtin name, CODE is the builtin code, and FTYPE is the function
29768 type of the builtin. */
29770 static void
29771 make_cpu_type_builtin (const char* name, int code,
29772 enum ix86_builtin_func_type ftype, bool is_const)
29774 tree decl;
29775 tree type;
29777 type = ix86_get_builtin_func_type (ftype);
29778 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29779 NULL, NULL_TREE);
29780 gcc_assert (decl != NULL_TREE);
29781 ix86_builtins[(int) code] = decl;
29782 TREE_READONLY (decl) = is_const;
29785 /* Make builtins to get CPU type and features supported. The created
29786 builtins are :
29788 __builtin_cpu_init (), to detect cpu type and features,
29789 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29790 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
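/* A minimal usage sketch (user-level code, illustrative only):

     int
     avx_available (void)
     {
       __builtin_cpu_init ();
       return __builtin_cpu_supports ("avx");
     }
*/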
29793 static void
29794 ix86_init_platform_type_builtins (void)
29796 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29797 INT_FTYPE_VOID, false);
29798 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29799 INT_FTYPE_PCCHAR, true);
29800 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29801 INT_FTYPE_PCCHAR, true);
29804 /* Internal method for ix86_init_builtins. */
29806 static void
29807 ix86_init_builtins_va_builtins_abi (void)
29809 tree ms_va_ref, sysv_va_ref;
29810 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29811 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29812 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29813 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29815 if (!TARGET_64BIT)
29816 return;
29817 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29818 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29819 ms_va_ref = build_reference_type (ms_va_list_type_node);
29820 sysv_va_ref =
29821 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29823 fnvoid_va_end_ms =
29824 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29825 fnvoid_va_start_ms =
29826 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29827 fnvoid_va_end_sysv =
29828 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29829 fnvoid_va_start_sysv =
29830 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29831 NULL_TREE);
29832 fnvoid_va_copy_ms =
29833 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29834 NULL_TREE);
29835 fnvoid_va_copy_sysv =
29836 build_function_type_list (void_type_node, sysv_va_ref,
29837 sysv_va_ref, NULL_TREE);
29839 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29840 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29841 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29842 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29843 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29844 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29845 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29846 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29847 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29848 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29849 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29850 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29853 static void
29854 ix86_init_builtin_types (void)
29856 tree float128_type_node, float80_type_node;
29858 /* The __float80 type. */
29859 float80_type_node = long_double_type_node;
29860 if (TYPE_MODE (float80_type_node) != XFmode)
29862 /* The __float80 type. */
29863 float80_type_node = make_node (REAL_TYPE);
29865 TYPE_PRECISION (float80_type_node) = 80;
29866 layout_type (float80_type_node);
29868 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29870 /* The __float128 type. */
29871 float128_type_node = make_node (REAL_TYPE);
29872 TYPE_PRECISION (float128_type_node) = 128;
29873 layout_type (float128_type_node);
29874 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29876 /* This macro is built by i386-builtin-types.awk. */
29877 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29880 static void
29881 ix86_init_builtins (void)
29883 tree t;
29885 ix86_init_builtin_types ();
29887 /* Builtins to get CPU type and features. */
29888 ix86_init_platform_type_builtins ();
29890 /* TFmode support builtins. */
29891 def_builtin_const (0, "__builtin_infq",
29892 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29893 def_builtin_const (0, "__builtin_huge_valq",
29894 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29896 /* We will expand them to a normal call if SSE isn't available since
29897 they are used by libgcc. */
29898 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29899 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29900 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29901 TREE_READONLY (t) = 1;
29902 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29904 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29905 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29906 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29907 TREE_READONLY (t) = 1;
29908 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29910 ix86_init_tm_builtins ();
29911 ix86_init_mmx_sse_builtins ();
29913 if (TARGET_LP64)
29914 ix86_init_builtins_va_builtins_abi ();
29916 #ifdef SUBTARGET_INIT_BUILTINS
29917 SUBTARGET_INIT_BUILTINS;
29918 #endif
29921 /* Return the ix86 builtin for CODE. */
29923 static tree
29924 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29926 if (code >= IX86_BUILTIN_MAX)
29927 return error_mark_node;
29929 return ix86_builtins[code];
29932 /* Errors in the source file can cause expand_expr to return const0_rtx
29933 where we expect a vector. To avoid crashing, use one of the vector
29934 clear instructions. */
29935 static rtx
29936 safe_vector_operand (rtx x, enum machine_mode mode)
29938 if (x == const0_rtx)
29939 x = CONST0_RTX (mode);
29940 return x;
29943 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
29945 static rtx
29946 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
29948 rtx pat;
29949 tree arg0 = CALL_EXPR_ARG (exp, 0);
29950 tree arg1 = CALL_EXPR_ARG (exp, 1);
29951 rtx op0 = expand_normal (arg0);
29952 rtx op1 = expand_normal (arg1);
29953 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29954 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29955 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
29957 if (VECTOR_MODE_P (mode0))
29958 op0 = safe_vector_operand (op0, mode0);
29959 if (VECTOR_MODE_P (mode1))
29960 op1 = safe_vector_operand (op1, mode1);
29962 if (optimize || !target
29963 || GET_MODE (target) != tmode
29964 || !insn_data[icode].operand[0].predicate (target, tmode))
29965 target = gen_reg_rtx (tmode);
29967 if (GET_MODE (op1) == SImode && mode1 == TImode)
29969 rtx x = gen_reg_rtx (V4SImode);
29970 emit_insn (gen_sse2_loadd (x, op1));
29971 op1 = gen_lowpart (TImode, x);
29974 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29975 op0 = copy_to_mode_reg (mode0, op0);
29976 if (!insn_data[icode].operand[2].predicate (op1, mode1))
29977 op1 = copy_to_mode_reg (mode1, op1);
29979 pat = GEN_FCN (icode) (target, op0, op1);
29980 if (! pat)
29981 return 0;
29983 emit_insn (pat);
29985 return target;
29988 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
29990 static rtx
29991 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
29992 enum ix86_builtin_func_type m_type,
29993 enum rtx_code sub_code)
29995 rtx pat;
29996 int i;
29997 int nargs;
29998 bool comparison_p = false;
29999 bool tf_p = false;
30000 bool last_arg_constant = false;
30001 int num_memory = 0;
30002 struct {
30003 rtx op;
30004 enum machine_mode mode;
30005 } args[4];
30007 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30009 switch (m_type)
30011 case MULTI_ARG_4_DF2_DI_I:
30012 case MULTI_ARG_4_DF2_DI_I1:
30013 case MULTI_ARG_4_SF2_SI_I:
30014 case MULTI_ARG_4_SF2_SI_I1:
30015 nargs = 4;
30016 last_arg_constant = true;
30017 break;
30019 case MULTI_ARG_3_SF:
30020 case MULTI_ARG_3_DF:
30021 case MULTI_ARG_3_SF2:
30022 case MULTI_ARG_3_DF2:
30023 case MULTI_ARG_3_DI:
30024 case MULTI_ARG_3_SI:
30025 case MULTI_ARG_3_SI_DI:
30026 case MULTI_ARG_3_HI:
30027 case MULTI_ARG_3_HI_SI:
30028 case MULTI_ARG_3_QI:
30029 case MULTI_ARG_3_DI2:
30030 case MULTI_ARG_3_SI2:
30031 case MULTI_ARG_3_HI2:
30032 case MULTI_ARG_3_QI2:
30033 nargs = 3;
30034 break;
30036 case MULTI_ARG_2_SF:
30037 case MULTI_ARG_2_DF:
30038 case MULTI_ARG_2_DI:
30039 case MULTI_ARG_2_SI:
30040 case MULTI_ARG_2_HI:
30041 case MULTI_ARG_2_QI:
30042 nargs = 2;
30043 break;
30045 case MULTI_ARG_2_DI_IMM:
30046 case MULTI_ARG_2_SI_IMM:
30047 case MULTI_ARG_2_HI_IMM:
30048 case MULTI_ARG_2_QI_IMM:
30049 nargs = 2;
30050 last_arg_constant = true;
30051 break;
30053 case MULTI_ARG_1_SF:
30054 case MULTI_ARG_1_DF:
30055 case MULTI_ARG_1_SF2:
30056 case MULTI_ARG_1_DF2:
30057 case MULTI_ARG_1_DI:
30058 case MULTI_ARG_1_SI:
30059 case MULTI_ARG_1_HI:
30060 case MULTI_ARG_1_QI:
30061 case MULTI_ARG_1_SI_DI:
30062 case MULTI_ARG_1_HI_DI:
30063 case MULTI_ARG_1_HI_SI:
30064 case MULTI_ARG_1_QI_DI:
30065 case MULTI_ARG_1_QI_SI:
30066 case MULTI_ARG_1_QI_HI:
30067 nargs = 1;
30068 break;
30070 case MULTI_ARG_2_DI_CMP:
30071 case MULTI_ARG_2_SI_CMP:
30072 case MULTI_ARG_2_HI_CMP:
30073 case MULTI_ARG_2_QI_CMP:
30074 nargs = 2;
30075 comparison_p = true;
30076 break;
30078 case MULTI_ARG_2_SF_TF:
30079 case MULTI_ARG_2_DF_TF:
30080 case MULTI_ARG_2_DI_TF:
30081 case MULTI_ARG_2_SI_TF:
30082 case MULTI_ARG_2_HI_TF:
30083 case MULTI_ARG_2_QI_TF:
30084 nargs = 2;
30085 tf_p = true;
30086 break;
30088 default:
30089 gcc_unreachable ();
30092 if (optimize || !target
30093 || GET_MODE (target) != tmode
30094 || !insn_data[icode].operand[0].predicate (target, tmode))
30095 target = gen_reg_rtx (tmode);
30097 gcc_assert (nargs <= 4);
30099 for (i = 0; i < nargs; i++)
30101 tree arg = CALL_EXPR_ARG (exp, i);
30102 rtx op = expand_normal (arg);
30103 int adjust = (comparison_p) ? 1 : 0;
30104 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30106 if (last_arg_constant && i == nargs - 1)
30108 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30110 enum insn_code new_icode = icode;
30111 switch (icode)
30113 case CODE_FOR_xop_vpermil2v2df3:
30114 case CODE_FOR_xop_vpermil2v4sf3:
30115 case CODE_FOR_xop_vpermil2v4df3:
30116 case CODE_FOR_xop_vpermil2v8sf3:
30117 error ("the last argument must be a 2-bit immediate");
30118 return gen_reg_rtx (tmode);
30119 case CODE_FOR_xop_rotlv2di3:
30120 new_icode = CODE_FOR_rotlv2di3;
30121 goto xop_rotl;
30122 case CODE_FOR_xop_rotlv4si3:
30123 new_icode = CODE_FOR_rotlv4si3;
30124 goto xop_rotl;
30125 case CODE_FOR_xop_rotlv8hi3:
30126 new_icode = CODE_FOR_rotlv8hi3;
30127 goto xop_rotl;
30128 case CODE_FOR_xop_rotlv16qi3:
30129 new_icode = CODE_FOR_rotlv16qi3;
30130 xop_rotl:
30131 if (CONST_INT_P (op))
30133 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30134 op = GEN_INT (INTVAL (op) & mask);
30135 gcc_checking_assert
30136 (insn_data[icode].operand[i + 1].predicate (op, mode));
30138 else
30140 gcc_checking_assert
30141 (nargs == 2
30142 && insn_data[new_icode].operand[0].mode == tmode
30143 && insn_data[new_icode].operand[1].mode == tmode
30144 && insn_data[new_icode].operand[2].mode == mode
30145 && insn_data[new_icode].operand[0].predicate
30146 == insn_data[icode].operand[0].predicate
30147 && insn_data[new_icode].operand[1].predicate
30148 == insn_data[icode].operand[1].predicate);
30149 icode = new_icode;
30150 goto non_constant;
30152 break;
30153 default:
30154 gcc_unreachable ();
30158 else
30160 non_constant:
30161 if (VECTOR_MODE_P (mode))
30162 op = safe_vector_operand (op, mode);
30164 /* If we aren't optimizing, only allow one memory operand to be
30165 generated. */
30166 if (memory_operand (op, mode))
30167 num_memory++;
30169 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30171 if (optimize
30172 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30173 || num_memory > 1)
30174 op = force_reg (mode, op);
30177 args[i].op = op;
30178 args[i].mode = mode;
30181 switch (nargs)
30183 case 1:
30184 pat = GEN_FCN (icode) (target, args[0].op);
30185 break;
30187 case 2:
30188 if (tf_p)
30189 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30190 GEN_INT ((int)sub_code));
30191 else if (! comparison_p)
30192 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30193 else
30195 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30196 args[0].op,
30197 args[1].op);
30199 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30201 break;
30203 case 3:
30204 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30205 break;
30207 case 4:
30208 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30209 break;
30211 default:
30212 gcc_unreachable ();
30215 if (! pat)
30216 return 0;
30218 emit_insn (pat);
30219 return target;
30222 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30223 insns with vec_merge. */
30225 static rtx
30226 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30227 rtx target)
30229 rtx pat;
30230 tree arg0 = CALL_EXPR_ARG (exp, 0);
30231 rtx op1, op0 = expand_normal (arg0);
30232 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30233 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30235 if (optimize || !target
30236 || GET_MODE (target) != tmode
30237 || !insn_data[icode].operand[0].predicate (target, tmode))
30238 target = gen_reg_rtx (tmode);
30240 if (VECTOR_MODE_P (mode0))
30241 op0 = safe_vector_operand (op0, mode0);
30243 if ((optimize && !register_operand (op0, mode0))
30244 || !insn_data[icode].operand[1].predicate (op0, mode0))
30245 op0 = copy_to_mode_reg (mode0, op0);
30247 op1 = op0;
30248 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30249 op1 = copy_to_mode_reg (mode0, op1);
30251 pat = GEN_FCN (icode) (target, op0, op1);
30252 if (! pat)
30253 return 0;
30254 emit_insn (pat);
30255 return target;
30258 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30260 static rtx
30261 ix86_expand_sse_compare (const struct builtin_description *d,
30262 tree exp, rtx target, bool swap)
30264 rtx pat;
30265 tree arg0 = CALL_EXPR_ARG (exp, 0);
30266 tree arg1 = CALL_EXPR_ARG (exp, 1);
30267 rtx op0 = expand_normal (arg0);
30268 rtx op1 = expand_normal (arg1);
30269 rtx op2;
30270 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30271 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30272 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30273 enum rtx_code comparison = d->comparison;
30275 if (VECTOR_MODE_P (mode0))
30276 op0 = safe_vector_operand (op0, mode0);
30277 if (VECTOR_MODE_P (mode1))
30278 op1 = safe_vector_operand (op1, mode1);
30280 /* Swap operands if we have a comparison that isn't available in
30281 hardware. */
30282 if (swap)
30284 rtx tmp = gen_reg_rtx (mode1);
30285 emit_move_insn (tmp, op1);
30286 op1 = op0;
30287 op0 = tmp;
30290 if (optimize || !target
30291 || GET_MODE (target) != tmode
30292 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30293 target = gen_reg_rtx (tmode);
30295 if ((optimize && !register_operand (op0, mode0))
30296 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30297 op0 = copy_to_mode_reg (mode0, op0);
30298 if ((optimize && !register_operand (op1, mode1))
30299 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30300 op1 = copy_to_mode_reg (mode1, op1);
30302 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30303 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30304 if (! pat)
30305 return 0;
30306 emit_insn (pat);
30307 return target;
30310 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30312 static rtx
30313 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30314 rtx target)
30316 rtx pat;
30317 tree arg0 = CALL_EXPR_ARG (exp, 0);
30318 tree arg1 = CALL_EXPR_ARG (exp, 1);
30319 rtx op0 = expand_normal (arg0);
30320 rtx op1 = expand_normal (arg1);
30321 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30322 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30323 enum rtx_code comparison = d->comparison;
30325 if (VECTOR_MODE_P (mode0))
30326 op0 = safe_vector_operand (op0, mode0);
30327 if (VECTOR_MODE_P (mode1))
30328 op1 = safe_vector_operand (op1, mode1);
30330 /* Swap operands if we have a comparison that isn't available in
30331 hardware. */
30332 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30334 rtx tmp = op1;
30335 op1 = op0;
30336 op0 = tmp;
30339 target = gen_reg_rtx (SImode);
30340 emit_move_insn (target, const0_rtx);
30341 target = gen_rtx_SUBREG (QImode, target, 0);
30343 if ((optimize && !register_operand (op0, mode0))
30344 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30345 op0 = copy_to_mode_reg (mode0, op0);
30346 if ((optimize && !register_operand (op1, mode1))
30347 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30348 op1 = copy_to_mode_reg (mode1, op1);
30350 pat = GEN_FCN (d->icode) (op0, op1);
30351 if (! pat)
30352 return 0;
30353 emit_insn (pat);
30354 emit_insn (gen_rtx_SET (VOIDmode,
30355 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30356 gen_rtx_fmt_ee (comparison, QImode,
30357 SET_DEST (pat),
30358 const0_rtx)));
30360 return SUBREG_REG (target);
30363 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30365 static rtx
30366 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30367 rtx target)
30369 rtx pat;
30370 tree arg0 = CALL_EXPR_ARG (exp, 0);
30371 rtx op1, op0 = expand_normal (arg0);
30372 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30373 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30375 if (optimize || target == 0
30376 || GET_MODE (target) != tmode
30377 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30378 target = gen_reg_rtx (tmode);
30380 if (VECTOR_MODE_P (mode0))
30381 op0 = safe_vector_operand (op0, mode0);
30383 if ((optimize && !register_operand (op0, mode0))
30384 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30385 op0 = copy_to_mode_reg (mode0, op0);
30387 op1 = GEN_INT (d->comparison);
30389 pat = GEN_FCN (d->icode) (target, op0, op1);
30390 if (! pat)
30391 return 0;
30392 emit_insn (pat);
30393 return target;
30396 static rtx
30397 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30398 tree exp, rtx target)
30400 rtx pat;
30401 tree arg0 = CALL_EXPR_ARG (exp, 0);
30402 tree arg1 = CALL_EXPR_ARG (exp, 1);
30403 rtx op0 = expand_normal (arg0);
30404 rtx op1 = expand_normal (arg1);
30405 rtx op2;
30406 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30407 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30408 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30410 if (optimize || target == 0
30411 || GET_MODE (target) != tmode
30412 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30413 target = gen_reg_rtx (tmode);
30415 op0 = safe_vector_operand (op0, mode0);
30416 op1 = safe_vector_operand (op1, mode1);
30418 if ((optimize && !register_operand (op0, mode0))
30419 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30420 op0 = copy_to_mode_reg (mode0, op0);
30421 if ((optimize && !register_operand (op1, mode1))
30422 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30423 op1 = copy_to_mode_reg (mode1, op1);
30425 op2 = GEN_INT (d->comparison);
30427 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30428 if (! pat)
30429 return 0;
30430 emit_insn (pat);
30431 return target;
30434 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30436 static rtx
30437 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30438 rtx target)
30440 rtx pat;
30441 tree arg0 = CALL_EXPR_ARG (exp, 0);
30442 tree arg1 = CALL_EXPR_ARG (exp, 1);
30443 rtx op0 = expand_normal (arg0);
30444 rtx op1 = expand_normal (arg1);
30445 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30446 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30447 enum rtx_code comparison = d->comparison;
30449 if (VECTOR_MODE_P (mode0))
30450 op0 = safe_vector_operand (op0, mode0);
30451 if (VECTOR_MODE_P (mode1))
30452 op1 = safe_vector_operand (op1, mode1);
30454 target = gen_reg_rtx (SImode);
30455 emit_move_insn (target, const0_rtx);
30456 target = gen_rtx_SUBREG (QImode, target, 0);
30458 if ((optimize && !register_operand (op0, mode0))
30459 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30460 op0 = copy_to_mode_reg (mode0, op0);
30461 if ((optimize && !register_operand (op1, mode1))
30462 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30463 op1 = copy_to_mode_reg (mode1, op1);
30465 pat = GEN_FCN (d->icode) (op0, op1);
30466 if (! pat)
30467 return 0;
30468 emit_insn (pat);
30469 emit_insn (gen_rtx_SET (VOIDmode,
30470 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30471 gen_rtx_fmt_ee (comparison, QImode,
30472 SET_DEST (pat),
30473 const0_rtx)));
30475 return SUBREG_REG (target);
30478 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30480 static rtx
30481 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30482 tree exp, rtx target)
30484 rtx pat;
30485 tree arg0 = CALL_EXPR_ARG (exp, 0);
30486 tree arg1 = CALL_EXPR_ARG (exp, 1);
30487 tree arg2 = CALL_EXPR_ARG (exp, 2);
30488 tree arg3 = CALL_EXPR_ARG (exp, 3);
30489 tree arg4 = CALL_EXPR_ARG (exp, 4);
30490 rtx scratch0, scratch1;
30491 rtx op0 = expand_normal (arg0);
30492 rtx op1 = expand_normal (arg1);
30493 rtx op2 = expand_normal (arg2);
30494 rtx op3 = expand_normal (arg3);
30495 rtx op4 = expand_normal (arg4);
30496 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30498 tmode0 = insn_data[d->icode].operand[0].mode;
30499 tmode1 = insn_data[d->icode].operand[1].mode;
30500 modev2 = insn_data[d->icode].operand[2].mode;
30501 modei3 = insn_data[d->icode].operand[3].mode;
30502 modev4 = insn_data[d->icode].operand[4].mode;
30503 modei5 = insn_data[d->icode].operand[5].mode;
30504 modeimm = insn_data[d->icode].operand[6].mode;
30506 if (VECTOR_MODE_P (modev2))
30507 op0 = safe_vector_operand (op0, modev2);
30508 if (VECTOR_MODE_P (modev4))
30509 op2 = safe_vector_operand (op2, modev4);
30511 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30512 op0 = copy_to_mode_reg (modev2, op0);
30513 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30514 op1 = copy_to_mode_reg (modei3, op1);
30515 if ((optimize && !register_operand (op2, modev4))
30516 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30517 op2 = copy_to_mode_reg (modev4, op2);
30518 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30519 op3 = copy_to_mode_reg (modei5, op3);
30521 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30523 error ("the fifth argument must be an 8-bit immediate");
30524 return const0_rtx;
30527 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30529 if (optimize || !target
30530 || GET_MODE (target) != tmode0
30531 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30532 target = gen_reg_rtx (tmode0);
30534 scratch1 = gen_reg_rtx (tmode1);
30536 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30538 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30540 if (optimize || !target
30541 || GET_MODE (target) != tmode1
30542 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30543 target = gen_reg_rtx (tmode1);
30545 scratch0 = gen_reg_rtx (tmode0);
30547 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30549 else
30551 gcc_assert (d->flag);
30553 scratch0 = gen_reg_rtx (tmode0);
30554 scratch1 = gen_reg_rtx (tmode1);
30556 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30559 if (! pat)
30560 return 0;
30562 emit_insn (pat);
30564 if (d->flag)
30566 target = gen_reg_rtx (SImode);
30567 emit_move_insn (target, const0_rtx);
30568 target = gen_rtx_SUBREG (QImode, target, 0);
30570 emit_insn
30571 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30572 gen_rtx_fmt_ee (EQ, QImode,
30573 gen_rtx_REG ((enum machine_mode) d->flag,
30574 FLAGS_REG),
30575 const0_rtx)));
30576 return SUBREG_REG (target);
30578 else
30579 return target;
30583 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30585 static rtx
30586 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30587 tree exp, rtx target)
30589 rtx pat;
30590 tree arg0 = CALL_EXPR_ARG (exp, 0);
30591 tree arg1 = CALL_EXPR_ARG (exp, 1);
30592 tree arg2 = CALL_EXPR_ARG (exp, 2);
30593 rtx scratch0, scratch1;
30594 rtx op0 = expand_normal (arg0);
30595 rtx op1 = expand_normal (arg1);
30596 rtx op2 = expand_normal (arg2);
30597 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30599 tmode0 = insn_data[d->icode].operand[0].mode;
30600 tmode1 = insn_data[d->icode].operand[1].mode;
30601 modev2 = insn_data[d->icode].operand[2].mode;
30602 modev3 = insn_data[d->icode].operand[3].mode;
30603 modeimm = insn_data[d->icode].operand[4].mode;
30605 if (VECTOR_MODE_P (modev2))
30606 op0 = safe_vector_operand (op0, modev2);
30607 if (VECTOR_MODE_P (modev3))
30608 op1 = safe_vector_operand (op1, modev3);
30610 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30611 op0 = copy_to_mode_reg (modev2, op0);
30612 if ((optimize && !register_operand (op1, modev3))
30613 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30614 op1 = copy_to_mode_reg (modev3, op1);
30616 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30618 error ("the third argument must be an 8-bit immediate");
30619 return const0_rtx;
30622 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30624 if (optimize || !target
30625 || GET_MODE (target) != tmode0
30626 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30627 target = gen_reg_rtx (tmode0);
30629 scratch1 = gen_reg_rtx (tmode1);
30631 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30633 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30635 if (optimize || !target
30636 || GET_MODE (target) != tmode1
30637 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30638 target = gen_reg_rtx (tmode1);
30640 scratch0 = gen_reg_rtx (tmode0);
30642 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30644 else
30646 gcc_assert (d->flag);
30648 scratch0 = gen_reg_rtx (tmode0);
30649 scratch1 = gen_reg_rtx (tmode1);
30651 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30654 if (! pat)
30655 return 0;
30657 emit_insn (pat);
30659 if (d->flag)
30661 target = gen_reg_rtx (SImode);
30662 emit_move_insn (target, const0_rtx);
30663 target = gen_rtx_SUBREG (QImode, target, 0);
30665 emit_insn
30666 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30667 gen_rtx_fmt_ee (EQ, QImode,
30668 gen_rtx_REG ((enum machine_mode) d->flag,
30669 FLAGS_REG),
30670 const0_rtx)));
30671 return SUBREG_REG (target);
30673 else
30674 return target;
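/* Illustrative sketch, not part of the original source: the implicit-length
   variant follows the same shape.  Assuming _mm_cmpistri from
   <nmmintrin.h> maps to __builtin_ia32_pcmpistri128, the mode byte below
   must again be a constant; the range vector is NUL-terminated, so its
   implicit length is two:

     #include <nmmintrin.h>

     int
     find_digit (__m128i chunk)
     {
       const __m128i range = _mm_setr_epi8 ('0', '9', 0, 0, 0, 0, 0, 0,
                                            0, 0, 0, 0, 0, 0, 0, 0);
       return _mm_cmpistri (range, chunk,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES);
     }  */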
30677 /* Subroutine of ix86_expand_builtin to take care of insns with
30678 variable number of operands. */
30680 static rtx
30681 ix86_expand_args_builtin (const struct builtin_description *d,
30682 tree exp, rtx target)
30684 rtx pat, real_target;
30685 unsigned int i, nargs;
30686 unsigned int nargs_constant = 0;
30687 int num_memory = 0;
30688 struct
30690 rtx op;
30691 enum machine_mode mode;
30692 } args[4];
30693 bool last_arg_count = false;
30694 enum insn_code icode = d->icode;
30695 const struct insn_data_d *insn_p = &insn_data[icode];
30696 enum machine_mode tmode = insn_p->operand[0].mode;
30697 enum machine_mode rmode = VOIDmode;
30698 bool swap = false;
30699 enum rtx_code comparison = d->comparison;
30701 switch ((enum ix86_builtin_func_type) d->flag)
30703 case V2DF_FTYPE_V2DF_ROUND:
30704 case V4DF_FTYPE_V4DF_ROUND:
30705 case V4SF_FTYPE_V4SF_ROUND:
30706 case V8SF_FTYPE_V8SF_ROUND:
30707 case V4SI_FTYPE_V4SF_ROUND:
30708 case V8SI_FTYPE_V8SF_ROUND:
30709 return ix86_expand_sse_round (d, exp, target);
30710 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30711 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30712 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30713 case INT_FTYPE_V8SF_V8SF_PTEST:
30714 case INT_FTYPE_V4DI_V4DI_PTEST:
30715 case INT_FTYPE_V4DF_V4DF_PTEST:
30716 case INT_FTYPE_V4SF_V4SF_PTEST:
30717 case INT_FTYPE_V2DI_V2DI_PTEST:
30718 case INT_FTYPE_V2DF_V2DF_PTEST:
30719 return ix86_expand_sse_ptest (d, exp, target);
30720 case FLOAT128_FTYPE_FLOAT128:
30721 case FLOAT_FTYPE_FLOAT:
30722 case INT_FTYPE_INT:
30723 case UINT64_FTYPE_INT:
30724 case UINT16_FTYPE_UINT16:
30725 case INT64_FTYPE_INT64:
30726 case INT64_FTYPE_V4SF:
30727 case INT64_FTYPE_V2DF:
30728 case INT_FTYPE_V16QI:
30729 case INT_FTYPE_V8QI:
30730 case INT_FTYPE_V8SF:
30731 case INT_FTYPE_V4DF:
30732 case INT_FTYPE_V4SF:
30733 case INT_FTYPE_V2DF:
30734 case INT_FTYPE_V32QI:
30735 case V16QI_FTYPE_V16QI:
30736 case V8SI_FTYPE_V8SF:
30737 case V8SI_FTYPE_V4SI:
30738 case V8HI_FTYPE_V8HI:
30739 case V8HI_FTYPE_V16QI:
30740 case V8QI_FTYPE_V8QI:
30741 case V8SF_FTYPE_V8SF:
30742 case V8SF_FTYPE_V8SI:
30743 case V8SF_FTYPE_V4SF:
30744 case V8SF_FTYPE_V8HI:
30745 case V4SI_FTYPE_V4SI:
30746 case V4SI_FTYPE_V16QI:
30747 case V4SI_FTYPE_V4SF:
30748 case V4SI_FTYPE_V8SI:
30749 case V4SI_FTYPE_V8HI:
30750 case V4SI_FTYPE_V4DF:
30751 case V4SI_FTYPE_V2DF:
30752 case V4HI_FTYPE_V4HI:
30753 case V4DF_FTYPE_V4DF:
30754 case V4DF_FTYPE_V4SI:
30755 case V4DF_FTYPE_V4SF:
30756 case V4DF_FTYPE_V2DF:
30757 case V4SF_FTYPE_V4SF:
30758 case V4SF_FTYPE_V4SI:
30759 case V4SF_FTYPE_V8SF:
30760 case V4SF_FTYPE_V4DF:
30761 case V4SF_FTYPE_V8HI:
30762 case V4SF_FTYPE_V2DF:
30763 case V2DI_FTYPE_V2DI:
30764 case V2DI_FTYPE_V16QI:
30765 case V2DI_FTYPE_V8HI:
30766 case V2DI_FTYPE_V4SI:
30767 case V2DF_FTYPE_V2DF:
30768 case V2DF_FTYPE_V4SI:
30769 case V2DF_FTYPE_V4DF:
30770 case V2DF_FTYPE_V4SF:
30771 case V2DF_FTYPE_V2SI:
30772 case V2SI_FTYPE_V2SI:
30773 case V2SI_FTYPE_V4SF:
30774 case V2SI_FTYPE_V2SF:
30775 case V2SI_FTYPE_V2DF:
30776 case V2SF_FTYPE_V2SF:
30777 case V2SF_FTYPE_V2SI:
30778 case V32QI_FTYPE_V32QI:
30779 case V32QI_FTYPE_V16QI:
30780 case V16HI_FTYPE_V16HI:
30781 case V16HI_FTYPE_V8HI:
30782 case V8SI_FTYPE_V8SI:
30783 case V16HI_FTYPE_V16QI:
30784 case V8SI_FTYPE_V16QI:
30785 case V4DI_FTYPE_V16QI:
30786 case V8SI_FTYPE_V8HI:
30787 case V4DI_FTYPE_V8HI:
30788 case V4DI_FTYPE_V4SI:
30789 case V4DI_FTYPE_V2DI:
30790 nargs = 1;
30791 break;
30792 case V4SF_FTYPE_V4SF_VEC_MERGE:
30793 case V2DF_FTYPE_V2DF_VEC_MERGE:
30794 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30795 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30796 case V16QI_FTYPE_V16QI_V16QI:
30797 case V16QI_FTYPE_V8HI_V8HI:
30798 case V8QI_FTYPE_V8QI_V8QI:
30799 case V8QI_FTYPE_V4HI_V4HI:
30800 case V8HI_FTYPE_V8HI_V8HI:
30801 case V8HI_FTYPE_V16QI_V16QI:
30802 case V8HI_FTYPE_V4SI_V4SI:
30803 case V8SF_FTYPE_V8SF_V8SF:
30804 case V8SF_FTYPE_V8SF_V8SI:
30805 case V4SI_FTYPE_V4SI_V4SI:
30806 case V4SI_FTYPE_V8HI_V8HI:
30807 case V4SI_FTYPE_V4SF_V4SF:
30808 case V4SI_FTYPE_V2DF_V2DF:
30809 case V4HI_FTYPE_V4HI_V4HI:
30810 case V4HI_FTYPE_V8QI_V8QI:
30811 case V4HI_FTYPE_V2SI_V2SI:
30812 case V4DF_FTYPE_V4DF_V4DF:
30813 case V4DF_FTYPE_V4DF_V4DI:
30814 case V4SF_FTYPE_V4SF_V4SF:
30815 case V4SF_FTYPE_V4SF_V4SI:
30816 case V4SF_FTYPE_V4SF_V2SI:
30817 case V4SF_FTYPE_V4SF_V2DF:
30818 case V4SF_FTYPE_V4SF_DI:
30819 case V4SF_FTYPE_V4SF_SI:
30820 case V2DI_FTYPE_V2DI_V2DI:
30821 case V2DI_FTYPE_V16QI_V16QI:
30822 case V2DI_FTYPE_V4SI_V4SI:
30823 case V2UDI_FTYPE_V4USI_V4USI:
30824 case V2DI_FTYPE_V2DI_V16QI:
30825 case V2DI_FTYPE_V2DF_V2DF:
30826 case V2SI_FTYPE_V2SI_V2SI:
30827 case V2SI_FTYPE_V4HI_V4HI:
30828 case V2SI_FTYPE_V2SF_V2SF:
30829 case V2DF_FTYPE_V2DF_V2DF:
30830 case V2DF_FTYPE_V2DF_V4SF:
30831 case V2DF_FTYPE_V2DF_V2DI:
30832 case V2DF_FTYPE_V2DF_DI:
30833 case V2DF_FTYPE_V2DF_SI:
30834 case V2SF_FTYPE_V2SF_V2SF:
30835 case V1DI_FTYPE_V1DI_V1DI:
30836 case V1DI_FTYPE_V8QI_V8QI:
30837 case V1DI_FTYPE_V2SI_V2SI:
30838 case V32QI_FTYPE_V16HI_V16HI:
30839 case V16HI_FTYPE_V8SI_V8SI:
30840 case V32QI_FTYPE_V32QI_V32QI:
30841 case V16HI_FTYPE_V32QI_V32QI:
30842 case V16HI_FTYPE_V16HI_V16HI:
30843 case V8SI_FTYPE_V4DF_V4DF:
30844 case V8SI_FTYPE_V8SI_V8SI:
30845 case V8SI_FTYPE_V16HI_V16HI:
30846 case V4DI_FTYPE_V4DI_V4DI:
30847 case V4DI_FTYPE_V8SI_V8SI:
30848 case V4UDI_FTYPE_V8USI_V8USI:
30849 if (comparison == UNKNOWN)
30850 return ix86_expand_binop_builtin (icode, exp, target);
30851 nargs = 2;
30852 break;
30853 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30854 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30855 gcc_assert (comparison != UNKNOWN);
30856 nargs = 2;
30857 swap = true;
30858 break;
30859 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30860 case V16HI_FTYPE_V16HI_SI_COUNT:
30861 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30862 case V8SI_FTYPE_V8SI_SI_COUNT:
30863 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30864 case V4DI_FTYPE_V4DI_INT_COUNT:
30865 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30866 case V8HI_FTYPE_V8HI_SI_COUNT:
30867 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30868 case V4SI_FTYPE_V4SI_SI_COUNT:
30869 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30870 case V4HI_FTYPE_V4HI_SI_COUNT:
30871 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30872 case V2DI_FTYPE_V2DI_SI_COUNT:
30873 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30874 case V2SI_FTYPE_V2SI_SI_COUNT:
30875 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30876 case V1DI_FTYPE_V1DI_SI_COUNT:
30877 nargs = 2;
30878 last_arg_count = true;
30879 break;
30880 case UINT64_FTYPE_UINT64_UINT64:
30881 case UINT_FTYPE_UINT_UINT:
30882 case UINT_FTYPE_UINT_USHORT:
30883 case UINT_FTYPE_UINT_UCHAR:
30884 case UINT16_FTYPE_UINT16_INT:
30885 case UINT8_FTYPE_UINT8_INT:
30886 nargs = 2;
30887 break;
30888 case V2DI_FTYPE_V2DI_INT_CONVERT:
30889 nargs = 2;
30890 rmode = V1TImode;
30891 nargs_constant = 1;
30892 break;
30893 case V4DI_FTYPE_V4DI_INT_CONVERT:
30894 nargs = 2;
30895 rmode = V2TImode;
30896 nargs_constant = 1;
30897 break;
30898 case V8HI_FTYPE_V8HI_INT:
30899 case V8HI_FTYPE_V8SF_INT:
30900 case V8HI_FTYPE_V4SF_INT:
30901 case V8SF_FTYPE_V8SF_INT:
30902 case V4SI_FTYPE_V4SI_INT:
30903 case V4SI_FTYPE_V8SI_INT:
30904 case V4HI_FTYPE_V4HI_INT:
30905 case V4DF_FTYPE_V4DF_INT:
30906 case V4SF_FTYPE_V4SF_INT:
30907 case V4SF_FTYPE_V8SF_INT:
30908 case V2DI_FTYPE_V2DI_INT:
30909 case V2DF_FTYPE_V2DF_INT:
30910 case V2DF_FTYPE_V4DF_INT:
30911 case V16HI_FTYPE_V16HI_INT:
30912 case V8SI_FTYPE_V8SI_INT:
30913 case V4DI_FTYPE_V4DI_INT:
30914 case V2DI_FTYPE_V4DI_INT:
30915 nargs = 2;
30916 nargs_constant = 1;
30917 break;
30918 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30919 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30920 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30921 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30922 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30923 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30924 nargs = 3;
30925 break;
30926 case V32QI_FTYPE_V32QI_V32QI_INT:
30927 case V16HI_FTYPE_V16HI_V16HI_INT:
30928 case V16QI_FTYPE_V16QI_V16QI_INT:
30929 case V4DI_FTYPE_V4DI_V4DI_INT:
30930 case V8HI_FTYPE_V8HI_V8HI_INT:
30931 case V8SI_FTYPE_V8SI_V8SI_INT:
30932 case V8SI_FTYPE_V8SI_V4SI_INT:
30933 case V8SF_FTYPE_V8SF_V8SF_INT:
30934 case V8SF_FTYPE_V8SF_V4SF_INT:
30935 case V4SI_FTYPE_V4SI_V4SI_INT:
30936 case V4DF_FTYPE_V4DF_V4DF_INT:
30937 case V4DF_FTYPE_V4DF_V2DF_INT:
30938 case V4SF_FTYPE_V4SF_V4SF_INT:
30939 case V2DI_FTYPE_V2DI_V2DI_INT:
30940 case V4DI_FTYPE_V4DI_V2DI_INT:
30941 case V2DF_FTYPE_V2DF_V2DF_INT:
30942 nargs = 3;
30943 nargs_constant = 1;
30944 break;
30945 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
30946 nargs = 3;
30947 rmode = V4DImode;
30948 nargs_constant = 1;
30949 break;
30950 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
30951 nargs = 3;
30952 rmode = V2DImode;
30953 nargs_constant = 1;
30954 break;
30955 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
30956 nargs = 3;
30957 rmode = DImode;
30958 nargs_constant = 1;
30959 break;
30960 case V2DI_FTYPE_V2DI_UINT_UINT:
30961 nargs = 3;
30962 nargs_constant = 2;
30963 break;
30964 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
30965 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
30966 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
30967 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
30968 nargs = 4;
30969 nargs_constant = 1;
30970 break;
30971 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
30972 nargs = 4;
30973 nargs_constant = 2;
30974 break;
30975 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
30976 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
30977 nargs = 4;
30978 break;
30979 default:
30980 gcc_unreachable ();
30983 gcc_assert (nargs <= ARRAY_SIZE (args));
30985 if (comparison != UNKNOWN)
30987 gcc_assert (nargs == 2);
30988 return ix86_expand_sse_compare (d, exp, target, swap);
30991 if (rmode == VOIDmode || rmode == tmode)
30993 if (optimize
30994 || target == 0
30995 || GET_MODE (target) != tmode
30996 || !insn_p->operand[0].predicate (target, tmode))
30997 target = gen_reg_rtx (tmode);
30998 real_target = target;
31000 else
31002 target = gen_reg_rtx (rmode);
31003 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31006 for (i = 0; i < nargs; i++)
31008 tree arg = CALL_EXPR_ARG (exp, i);
31009 rtx op = expand_normal (arg);
31010 enum machine_mode mode = insn_p->operand[i + 1].mode;
31011 bool match = insn_p->operand[i + 1].predicate (op, mode);
31013 if (last_arg_count && (i + 1) == nargs)
31015 /* SIMD shift insns take either an 8-bit immediate or a
31016 register as the count.  But the builtin functions take int as
31017 the count.  If the count doesn't match, we put it in a register. */
31018 if (!match)
31020 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31021 if (!insn_p->operand[i + 1].predicate (op, mode))
31022 op = copy_to_reg (op);
31025 else if ((nargs - i) <= nargs_constant)
31027 if (!match)
31028 switch (icode)
31030 case CODE_FOR_avx2_inserti128:
31031 case CODE_FOR_avx2_extracti128:
31032 error ("the last argument must be an 1-bit immediate");
31033 return const0_rtx;
31035 case CODE_FOR_sse4_1_roundsd:
31036 case CODE_FOR_sse4_1_roundss:
31038 case CODE_FOR_sse4_1_roundpd:
31039 case CODE_FOR_sse4_1_roundps:
31040 case CODE_FOR_avx_roundpd256:
31041 case CODE_FOR_avx_roundps256:
31043 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31044 case CODE_FOR_sse4_1_roundps_sfix:
31045 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31046 case CODE_FOR_avx_roundps_sfix256:
31048 case CODE_FOR_sse4_1_blendps:
31049 case CODE_FOR_avx_blendpd256:
31050 case CODE_FOR_avx_vpermilv4df:
31051 error ("the last argument must be a 4-bit immediate");
31052 return const0_rtx;
31054 case CODE_FOR_sse4_1_blendpd:
31055 case CODE_FOR_avx_vpermilv2df:
31056 case CODE_FOR_xop_vpermil2v2df3:
31057 case CODE_FOR_xop_vpermil2v4sf3:
31058 case CODE_FOR_xop_vpermil2v4df3:
31059 case CODE_FOR_xop_vpermil2v8sf3:
31060 error ("the last argument must be a 2-bit immediate");
31061 return const0_rtx;
31063 case CODE_FOR_avx_vextractf128v4df:
31064 case CODE_FOR_avx_vextractf128v8sf:
31065 case CODE_FOR_avx_vextractf128v8si:
31066 case CODE_FOR_avx_vinsertf128v4df:
31067 case CODE_FOR_avx_vinsertf128v8sf:
31068 case CODE_FOR_avx_vinsertf128v8si:
31069 error ("the last argument must be a 1-bit immediate");
31070 return const0_rtx;
31072 case CODE_FOR_avx_vmcmpv2df3:
31073 case CODE_FOR_avx_vmcmpv4sf3:
31074 case CODE_FOR_avx_cmpv2df3:
31075 case CODE_FOR_avx_cmpv4sf3:
31076 case CODE_FOR_avx_cmpv4df3:
31077 case CODE_FOR_avx_cmpv8sf3:
31078 error ("the last argument must be a 5-bit immediate");
31079 return const0_rtx;
31081 default:
31082 switch (nargs_constant)
31084 case 2:
31085 if ((nargs - i) == nargs_constant)
31087 error ("the next to last argument must be an 8-bit immediate");
31088 break;
31090 case 1:
31091 error ("the last argument must be an 8-bit immediate");
31092 break;
31093 default:
31094 gcc_unreachable ();
31096 return const0_rtx;
31099 else
31101 if (VECTOR_MODE_P (mode))
31102 op = safe_vector_operand (op, mode);
31104 /* If we aren't optimizing, only allow one memory operand to
31105 be generated. */
31106 if (memory_operand (op, mode))
31107 num_memory++;
31109 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31111 if (optimize || !match || num_memory > 1)
31112 op = copy_to_mode_reg (mode, op);
31114 else
31116 op = copy_to_reg (op);
31117 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31121 args[i].op = op;
31122 args[i].mode = mode;
31125 switch (nargs)
31127 case 1:
31128 pat = GEN_FCN (icode) (real_target, args[0].op);
31129 break;
31130 case 2:
31131 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31132 break;
31133 case 3:
31134 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31135 args[2].op);
31136 break;
31137 case 4:
31138 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31139 args[2].op, args[3].op);
31140 break;
31141 default:
31142 gcc_unreachable ();
31145 if (! pat)
31146 return 0;
31148 emit_insn (pat);
31149 return target;
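/* Illustrative sketch, not part of the original source: a typical trip
   through ix86_expand_args_builtin.  Assuming _mm_shuffle_epi32 from
   <emmintrin.h> maps to a V4SI_FTYPE_V4SI_INT builtin, nargs is 2 and
   nargs_constant is 1, so the selector must be an 8-bit immediate and
   anything else is rejected by the error handling above:

     #include <emmintrin.h>

     __m128i
     broadcast_lane0 (__m128i x)
     {
       return _mm_shuffle_epi32 (x, 0x00);
     }

   The selector 0x00 replicates element 0 into all four result lanes.  */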
31152 /* Subroutine of ix86_expand_builtin to take care of special insns
31153 with variable number of operands. */
31155 static rtx
31156 ix86_expand_special_args_builtin (const struct builtin_description *d,
31157 tree exp, rtx target)
31159 tree arg;
31160 rtx pat, op;
31161 unsigned int i, nargs, arg_adjust, memory;
31162 struct
31164 rtx op;
31165 enum machine_mode mode;
31166 } args[3];
31167 enum insn_code icode = d->icode;
31168 bool last_arg_constant = false;
31169 const struct insn_data_d *insn_p = &insn_data[icode];
31170 enum machine_mode tmode = insn_p->operand[0].mode;
31171 enum { load, store } klass;
31173 switch ((enum ix86_builtin_func_type) d->flag)
31175 case VOID_FTYPE_VOID:
31176 emit_insn (GEN_FCN (icode) (target));
31177 return 0;
31178 case VOID_FTYPE_UINT64:
31179 case VOID_FTYPE_UNSIGNED:
31180 nargs = 0;
31181 klass = store;
31182 memory = 0;
31183 break;
31185 case INT_FTYPE_VOID:
31186 case UINT64_FTYPE_VOID:
31187 case UNSIGNED_FTYPE_VOID:
31188 nargs = 0;
31189 klass = load;
31190 memory = 0;
31191 break;
31192 case UINT64_FTYPE_PUNSIGNED:
31193 case V2DI_FTYPE_PV2DI:
31194 case V4DI_FTYPE_PV4DI:
31195 case V32QI_FTYPE_PCCHAR:
31196 case V16QI_FTYPE_PCCHAR:
31197 case V8SF_FTYPE_PCV4SF:
31198 case V8SF_FTYPE_PCFLOAT:
31199 case V4SF_FTYPE_PCFLOAT:
31200 case V4DF_FTYPE_PCV2DF:
31201 case V4DF_FTYPE_PCDOUBLE:
31202 case V2DF_FTYPE_PCDOUBLE:
31203 case VOID_FTYPE_PVOID:
31204 nargs = 1;
31205 klass = load;
31206 memory = 0;
31207 break;
31208 case VOID_FTYPE_PV2SF_V4SF:
31209 case VOID_FTYPE_PV4DI_V4DI:
31210 case VOID_FTYPE_PV2DI_V2DI:
31211 case VOID_FTYPE_PCHAR_V32QI:
31212 case VOID_FTYPE_PCHAR_V16QI:
31213 case VOID_FTYPE_PFLOAT_V8SF:
31214 case VOID_FTYPE_PFLOAT_V4SF:
31215 case VOID_FTYPE_PDOUBLE_V4DF:
31216 case VOID_FTYPE_PDOUBLE_V2DF:
31217 case VOID_FTYPE_PLONGLONG_LONGLONG:
31218 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31219 case VOID_FTYPE_PINT_INT:
31220 nargs = 1;
31221 klass = store;
31222 /* Reserve memory operand for target. */
31223 memory = ARRAY_SIZE (args);
31224 break;
31225 case V4SF_FTYPE_V4SF_PCV2SF:
31226 case V2DF_FTYPE_V2DF_PCDOUBLE:
31227 nargs = 2;
31228 klass = load;
31229 memory = 1;
31230 break;
31231 case V8SF_FTYPE_PCV8SF_V8SI:
31232 case V4DF_FTYPE_PCV4DF_V4DI:
31233 case V4SF_FTYPE_PCV4SF_V4SI:
31234 case V2DF_FTYPE_PCV2DF_V2DI:
31235 case V8SI_FTYPE_PCV8SI_V8SI:
31236 case V4DI_FTYPE_PCV4DI_V4DI:
31237 case V4SI_FTYPE_PCV4SI_V4SI:
31238 case V2DI_FTYPE_PCV2DI_V2DI:
31239 nargs = 2;
31240 klass = load;
31241 memory = 0;
31242 break;
31243 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31244 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31245 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31246 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31247 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31248 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31249 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31250 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31251 nargs = 2;
31252 klass = store;
31253 /* Reserve memory operand for target. */
31254 memory = ARRAY_SIZE (args);
31255 break;
31256 case VOID_FTYPE_UINT_UINT_UINT:
31257 case VOID_FTYPE_UINT64_UINT_UINT:
31258 case UCHAR_FTYPE_UINT_UINT_UINT:
31259 case UCHAR_FTYPE_UINT64_UINT_UINT:
31260 nargs = 3;
31261 klass = load;
31262 memory = ARRAY_SIZE (args);
31263 last_arg_constant = true;
31264 break;
31265 default:
31266 gcc_unreachable ();
31269 gcc_assert (nargs <= ARRAY_SIZE (args));
31271 if (klass == store)
31273 arg = CALL_EXPR_ARG (exp, 0);
31274 op = expand_normal (arg);
31275 gcc_assert (target == 0);
31276 if (memory)
31278 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31279 target = gen_rtx_MEM (tmode, op);
31281 else
31282 target = force_reg (tmode, op);
31283 arg_adjust = 1;
31285 else
31287 arg_adjust = 0;
31288 if (optimize
31289 || target == 0
31290 || !register_operand (target, tmode)
31291 || GET_MODE (target) != tmode)
31292 target = gen_reg_rtx (tmode);
31295 for (i = 0; i < nargs; i++)
31297 enum machine_mode mode = insn_p->operand[i + 1].mode;
31298 bool match;
31300 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31301 op = expand_normal (arg);
31302 match = insn_p->operand[i + 1].predicate (op, mode);
31304 if (last_arg_constant && (i + 1) == nargs)
31306 if (!match)
31308 if (icode == CODE_FOR_lwp_lwpvalsi3
31309 || icode == CODE_FOR_lwp_lwpinssi3
31310 || icode == CODE_FOR_lwp_lwpvaldi3
31311 || icode == CODE_FOR_lwp_lwpinsdi3)
31312 error ("the last argument must be a 32-bit immediate");
31313 else
31314 error ("the last argument must be an 8-bit immediate");
31315 return const0_rtx;
31318 else
31320 if (i == memory)
31322 /* This must be the memory operand. */
31323 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31324 op = gen_rtx_MEM (mode, op);
31325 gcc_assert (GET_MODE (op) == mode
31326 || GET_MODE (op) == VOIDmode);
31328 else
31330 /* This must be a register. */
31331 if (VECTOR_MODE_P (mode))
31332 op = safe_vector_operand (op, mode);
31334 gcc_assert (GET_MODE (op) == mode
31335 || GET_MODE (op) == VOIDmode);
31336 op = copy_to_mode_reg (mode, op);
31340 args[i].op = op;
31341 args[i].mode = mode;
31344 switch (nargs)
31346 case 0:
31347 pat = GEN_FCN (icode) (target);
31348 break;
31349 case 1:
31350 pat = GEN_FCN (icode) (target, args[0].op);
31351 break;
31352 case 2:
31353 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31354 break;
31355 case 3:
31356 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31357 break;
31358 default:
31359 gcc_unreachable ();
31362 if (! pat)
31363 return 0;
31364 emit_insn (pat);
31365 return klass == store ? 0 : target;
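/* Illustrative sketch, not part of the original source: the special-args
   path handles loads and stores whose memory operand is passed as a
   pointer.  Assuming _mm256_loadu_pd maps to a V4DF_FTYPE_PCDOUBLE builtin
   (klass == load) and _mm256_storeu_pd to a VOID_FTYPE_PDOUBLE_V4DF builtin
   (klass == store, with the memory operand reserved for the target):

     #include <immintrin.h>

     void
     scale4 (double *p, double k)
     {
       __m256d v = _mm256_loadu_pd (p);
       _mm256_storeu_pd (p, _mm256_mul_pd (v, _mm256_set1_pd (k)));
     }  */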
31368 /* Return the integer constant in ARG. Constrain it to be in the range
31369 of the subparts of VEC_TYPE; issue an error if not. */
31371 static int
31372 get_element_number (tree vec_type, tree arg)
31374 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31376 if (!host_integerp (arg, 1)
31377 || (elt = tree_low_cst (arg, 1), elt > max))
31379 error ("selector must be an integer constant in the range 0..%wi", max);
31380 return 0;
31383 return elt;
31386 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31387 ix86_expand_vector_init. We DO have language-level syntax for this, in
31388 the form of (type){ init-list }. Except that since we can't place emms
31389 instructions from inside the compiler, we can't allow the use of MMX
31390 registers unless the user explicitly asks for it. So we do *not* define
31391 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31392 we have builtins invoked by mmintrin.h that give us license to emit
31393 these sorts of instructions. */
31395 static rtx
31396 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31398 enum machine_mode tmode = TYPE_MODE (type);
31399 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31400 int i, n_elt = GET_MODE_NUNITS (tmode);
31401 rtvec v = rtvec_alloc (n_elt);
31403 gcc_assert (VECTOR_MODE_P (tmode));
31404 gcc_assert (call_expr_nargs (exp) == n_elt);
31406 for (i = 0; i < n_elt; ++i)
31408 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31409 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31412 if (!target || !register_operand (target, tmode))
31413 target = gen_reg_rtx (tmode);
31415 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31416 return target;
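/* Illustrative sketch, not part of the original source: these MMX init
   builtins are reached through <mmintrin.h>; _mm_setr_pi32 is assumed to
   expand to __builtin_ia32_vec_init_v2si, which ends up here and becomes an
   ix86_expand_vector_init sequence.  The caller still owns the emms
   bookkeeping, hence the explicit _mm_empty:

     #include <mmintrin.h>

     void
     store_pair (__m64 *dst, int lo, int hi)
     {
       *dst = _mm_setr_pi32 (lo, hi);
       _mm_empty ();
     }  */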
31419 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31420 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31421 had a language-level syntax for referencing vector elements. */
31423 static rtx
31424 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31426 enum machine_mode tmode, mode0;
31427 tree arg0, arg1;
31428 int elt;
31429 rtx op0;
31431 arg0 = CALL_EXPR_ARG (exp, 0);
31432 arg1 = CALL_EXPR_ARG (exp, 1);
31434 op0 = expand_normal (arg0);
31435 elt = get_element_number (TREE_TYPE (arg0), arg1);
31437 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31438 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31439 gcc_assert (VECTOR_MODE_P (mode0));
31441 op0 = force_reg (mode0, op0);
31443 if (optimize || !target || !register_operand (target, tmode))
31444 target = gen_reg_rtx (tmode);
31446 ix86_expand_vector_extract (true, target, op0, elt);
31448 return target;
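/* Illustrative sketch, not part of the original source: a vec_ext builtin
   in action.  Assuming _mm_extract_epi16 from <emmintrin.h> maps to
   __builtin_ia32_vec_ext_v8hi, the selector goes through
   get_element_number and therefore has to be a constant in the range 0..7:

     #include <emmintrin.h>

     int
     third_halfword (__m128i x)
     {
       return _mm_extract_epi16 (x, 2);
     }  */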
31451 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31452 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31453 a language-level syntax for referencing vector elements. */
31455 static rtx
31456 ix86_expand_vec_set_builtin (tree exp)
31458 enum machine_mode tmode, mode1;
31459 tree arg0, arg1, arg2;
31460 int elt;
31461 rtx op0, op1, target;
31463 arg0 = CALL_EXPR_ARG (exp, 0);
31464 arg1 = CALL_EXPR_ARG (exp, 1);
31465 arg2 = CALL_EXPR_ARG (exp, 2);
31467 tmode = TYPE_MODE (TREE_TYPE (arg0));
31468 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31469 gcc_assert (VECTOR_MODE_P (tmode));
31471 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31472 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31473 elt = get_element_number (TREE_TYPE (arg0), arg2);
31475 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31476 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31478 op0 = force_reg (tmode, op0);
31479 op1 = force_reg (mode1, op1);
31481 /* OP0 is the source of these builtin functions and shouldn't be
31482 modified. Create a copy, use it and return it as target. */
31483 target = gen_reg_rtx (tmode);
31484 emit_move_insn (target, op0);
31485 ix86_expand_vector_set (true, target, op1, elt);
31487 return target;
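/* Illustrative sketch, not part of the original source: the matching
   vec_set case.  Assuming _mm_insert_epi16 from <emmintrin.h> maps to
   __builtin_ia32_vec_set_v8hi, the source vector is copied first, so the
   intrinsic behaves like a pure function:

     #include <emmintrin.h>

     __m128i
     set_third_halfword (__m128i x, int v)
     {
       return _mm_insert_epi16 (x, v, 2);
     }  */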
31490 /* Expand an expression EXP that calls a built-in function,
31491 with result going to TARGET if that's convenient
31492 (and in mode MODE if that's convenient).
31493 SUBTARGET may be used as the target for computing one of EXP's operands.
31494 IGNORE is nonzero if the value is to be ignored. */
31496 static rtx
31497 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31498 enum machine_mode mode ATTRIBUTE_UNUSED,
31499 int ignore ATTRIBUTE_UNUSED)
31501 const struct builtin_description *d;
31502 size_t i;
31503 enum insn_code icode;
31504 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31505 tree arg0, arg1, arg2, arg3, arg4;
31506 rtx op0, op1, op2, op3, op4, pat, insn;
31507 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31508 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31510 /* For CPU builtins that can be folded, fold first and expand the fold. */
31511 switch (fcode)
31513 case IX86_BUILTIN_CPU_INIT:
31515 /* Make it call __cpu_indicator_init in libgcc. */
31516 tree call_expr, fndecl, type;
31517 type = build_function_type_list (integer_type_node, NULL_TREE);
31518 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31519 call_expr = build_call_expr (fndecl, 0);
31520 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31522 case IX86_BUILTIN_CPU_IS:
31523 case IX86_BUILTIN_CPU_SUPPORTS:
31525 tree arg0 = CALL_EXPR_ARG (exp, 0);
31526 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31527 gcc_assert (fold_expr != NULL_TREE);
31528 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
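/* Illustrative sketch, not part of the original source: the folded CPU
   detection builtins are the documented __builtin_cpu_init,
   __builtin_cpu_is and __builtin_cpu_supports interfaces, e.g.:

     int
     have_avx2 (void)
     {
       __builtin_cpu_init ();
       return __builtin_cpu_supports ("avx2");
     }  */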
31532 /* Determine whether the builtin function is available under the current ISA.
31533 Originally the builtin was not created if it wasn't applicable to the
31534 current ISA based on the command line switches. With function specific
31535 options, we need to check in the context of the function making the call
31536 whether it is supported. */
31537 if (ix86_builtins_isa[fcode].isa
31538 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31540 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31541 NULL, (enum fpmath_unit) 0, false);
31543 if (!opts)
31544 error ("%qE needs unknown isa option", fndecl);
31545 else
31547 gcc_assert (opts != NULL);
31548 error ("%qE needs isa option %s", fndecl, opts);
31549 free (opts);
31551 return const0_rtx;
31554 switch (fcode)
31556 case IX86_BUILTIN_MASKMOVQ:
31557 case IX86_BUILTIN_MASKMOVDQU:
31558 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31559 ? CODE_FOR_mmx_maskmovq
31560 : CODE_FOR_sse2_maskmovdqu);
31561 /* Note the arg order is different from the operand order. */
31562 arg1 = CALL_EXPR_ARG (exp, 0);
31563 arg2 = CALL_EXPR_ARG (exp, 1);
31564 arg0 = CALL_EXPR_ARG (exp, 2);
31565 op0 = expand_normal (arg0);
31566 op1 = expand_normal (arg1);
31567 op2 = expand_normal (arg2);
31568 mode0 = insn_data[icode].operand[0].mode;
31569 mode1 = insn_data[icode].operand[1].mode;
31570 mode2 = insn_data[icode].operand[2].mode;
31572 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31573 op0 = gen_rtx_MEM (mode1, op0);
31575 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31576 op0 = copy_to_mode_reg (mode0, op0);
31577 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31578 op1 = copy_to_mode_reg (mode1, op1);
31579 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31580 op2 = copy_to_mode_reg (mode2, op2);
31581 pat = GEN_FCN (icode) (op0, op1, op2);
31582 if (! pat)
31583 return 0;
31584 emit_insn (pat);
31585 return 0;
31587 case IX86_BUILTIN_LDMXCSR:
31588 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31589 target = assign_386_stack_local (SImode, SLOT_TEMP);
31590 emit_move_insn (target, op0);
31591 emit_insn (gen_sse_ldmxcsr (target));
31592 return 0;
31594 case IX86_BUILTIN_STMXCSR:
31595 target = assign_386_stack_local (SImode, SLOT_TEMP);
31596 emit_insn (gen_sse_stmxcsr (target));
31597 return copy_to_mode_reg (SImode, target);
31599 case IX86_BUILTIN_CLFLUSH:
31600 arg0 = CALL_EXPR_ARG (exp, 0);
31601 op0 = expand_normal (arg0);
31602 icode = CODE_FOR_sse2_clflush;
31603 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31604 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31606 emit_insn (gen_sse2_clflush (op0));
31607 return 0;
31609 case IX86_BUILTIN_MONITOR:
31610 arg0 = CALL_EXPR_ARG (exp, 0);
31611 arg1 = CALL_EXPR_ARG (exp, 1);
31612 arg2 = CALL_EXPR_ARG (exp, 2);
31613 op0 = expand_normal (arg0);
31614 op1 = expand_normal (arg1);
31615 op2 = expand_normal (arg2);
31616 if (!REG_P (op0))
31617 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31618 if (!REG_P (op1))
31619 op1 = copy_to_mode_reg (SImode, op1);
31620 if (!REG_P (op2))
31621 op2 = copy_to_mode_reg (SImode, op2);
31622 emit_insn (ix86_gen_monitor (op0, op1, op2));
31623 return 0;
31625 case IX86_BUILTIN_MWAIT:
31626 arg0 = CALL_EXPR_ARG (exp, 0);
31627 arg1 = CALL_EXPR_ARG (exp, 1);
31628 op0 = expand_normal (arg0);
31629 op1 = expand_normal (arg1);
31630 if (!REG_P (op0))
31631 op0 = copy_to_mode_reg (SImode, op0);
31632 if (!REG_P (op1))
31633 op1 = copy_to_mode_reg (SImode, op1);
31634 emit_insn (gen_sse3_mwait (op0, op1));
31635 return 0;
31637 case IX86_BUILTIN_VEC_INIT_V2SI:
31638 case IX86_BUILTIN_VEC_INIT_V4HI:
31639 case IX86_BUILTIN_VEC_INIT_V8QI:
31640 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31642 case IX86_BUILTIN_VEC_EXT_V2DF:
31643 case IX86_BUILTIN_VEC_EXT_V2DI:
31644 case IX86_BUILTIN_VEC_EXT_V4SF:
31645 case IX86_BUILTIN_VEC_EXT_V4SI:
31646 case IX86_BUILTIN_VEC_EXT_V8HI:
31647 case IX86_BUILTIN_VEC_EXT_V2SI:
31648 case IX86_BUILTIN_VEC_EXT_V4HI:
31649 case IX86_BUILTIN_VEC_EXT_V16QI:
31650 return ix86_expand_vec_ext_builtin (exp, target);
31652 case IX86_BUILTIN_VEC_SET_V2DI:
31653 case IX86_BUILTIN_VEC_SET_V4SF:
31654 case IX86_BUILTIN_VEC_SET_V4SI:
31655 case IX86_BUILTIN_VEC_SET_V8HI:
31656 case IX86_BUILTIN_VEC_SET_V4HI:
31657 case IX86_BUILTIN_VEC_SET_V16QI:
31658 return ix86_expand_vec_set_builtin (exp);
31660 case IX86_BUILTIN_INFQ:
31661 case IX86_BUILTIN_HUGE_VALQ:
31663 REAL_VALUE_TYPE inf;
31664 rtx tmp;
31666 real_inf (&inf);
31667 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31669 tmp = validize_mem (force_const_mem (mode, tmp));
31671 if (target == 0)
31672 target = gen_reg_rtx (mode);
31674 emit_move_insn (target, tmp);
31675 return target;
31678 case IX86_BUILTIN_RDPMC:
31679 case IX86_BUILTIN_RDTSC:
31680 case IX86_BUILTIN_RDTSCP:
31682 op0 = gen_reg_rtx (DImode);
31683 op1 = gen_reg_rtx (DImode);
31685 if (fcode == IX86_BUILTIN_RDPMC)
31687 arg0 = CALL_EXPR_ARG (exp, 0);
31688 op2 = expand_normal (arg0);
31689 if (!register_operand (op2, SImode))
31690 op2 = copy_to_mode_reg (SImode, op2);
31692 insn = (TARGET_64BIT
31693 ? gen_rdpmc_rex64 (op0, op1, op2)
31694 : gen_rdpmc (op0, op2));
31695 emit_insn (insn);
31697 else if (fcode == IX86_BUILTIN_RDTSC)
31699 insn = (TARGET_64BIT
31700 ? gen_rdtsc_rex64 (op0, op1)
31701 : gen_rdtsc (op0));
31702 emit_insn (insn);
31704 else
31706 op2 = gen_reg_rtx (SImode);
31708 insn = (TARGET_64BIT
31709 ? gen_rdtscp_rex64 (op0, op1, op2)
31710 : gen_rdtscp (op0, op2));
31711 emit_insn (insn);
31713 arg0 = CALL_EXPR_ARG (exp, 0);
31714 op4 = expand_normal (arg0);
31715 if (!address_operand (op4, VOIDmode))
31717 op4 = convert_memory_address (Pmode, op4);
31718 op4 = copy_addr_to_reg (op4);
31720 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31723 if (target == 0)
31724 target = gen_reg_rtx (mode);
31726 if (TARGET_64BIT)
31728 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31729 op1, 1, OPTAB_DIRECT);
31730 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31731 op0, 1, OPTAB_DIRECT);
31734 emit_move_insn (target, op0);
31735 return target;
31737 case IX86_BUILTIN_FXSAVE:
31738 case IX86_BUILTIN_FXRSTOR:
31739 case IX86_BUILTIN_FXSAVE64:
31740 case IX86_BUILTIN_FXRSTOR64:
31741 switch (fcode)
31743 case IX86_BUILTIN_FXSAVE:
31744 icode = CODE_FOR_fxsave;
31745 break;
31746 case IX86_BUILTIN_FXRSTOR:
31747 icode = CODE_FOR_fxrstor;
31748 break;
31749 case IX86_BUILTIN_FXSAVE64:
31750 icode = CODE_FOR_fxsave64;
31751 break;
31752 case IX86_BUILTIN_FXRSTOR64:
31753 icode = CODE_FOR_fxrstor64;
31754 break;
31755 default:
31756 gcc_unreachable ();
31759 arg0 = CALL_EXPR_ARG (exp, 0);
31760 op0 = expand_normal (arg0);
31762 if (!address_operand (op0, VOIDmode))
31764 op0 = convert_memory_address (Pmode, op0);
31765 op0 = copy_addr_to_reg (op0);
31767 op0 = gen_rtx_MEM (BLKmode, op0);
31769 pat = GEN_FCN (icode) (op0);
31770 if (pat)
31771 emit_insn (pat);
31772 return 0;
31774 case IX86_BUILTIN_XSAVE:
31775 case IX86_BUILTIN_XRSTOR:
31776 case IX86_BUILTIN_XSAVE64:
31777 case IX86_BUILTIN_XRSTOR64:
31778 case IX86_BUILTIN_XSAVEOPT:
31779 case IX86_BUILTIN_XSAVEOPT64:
31780 arg0 = CALL_EXPR_ARG (exp, 0);
31781 arg1 = CALL_EXPR_ARG (exp, 1);
31782 op0 = expand_normal (arg0);
31783 op1 = expand_normal (arg1);
31785 if (!address_operand (op0, VOIDmode))
31787 op0 = convert_memory_address (Pmode, op0);
31788 op0 = copy_addr_to_reg (op0);
31790 op0 = gen_rtx_MEM (BLKmode, op0);
31792 op1 = force_reg (DImode, op1);
31794 if (TARGET_64BIT)
31796 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31797 NULL, 1, OPTAB_DIRECT);
31798 switch (fcode)
31800 case IX86_BUILTIN_XSAVE:
31801 icode = CODE_FOR_xsave_rex64;
31802 break;
31803 case IX86_BUILTIN_XRSTOR:
31804 icode = CODE_FOR_xrstor_rex64;
31805 break;
31806 case IX86_BUILTIN_XSAVE64:
31807 icode = CODE_FOR_xsave64;
31808 break;
31809 case IX86_BUILTIN_XRSTOR64:
31810 icode = CODE_FOR_xrstor64;
31811 break;
31812 case IX86_BUILTIN_XSAVEOPT:
31813 icode = CODE_FOR_xsaveopt_rex64;
31814 break;
31815 case IX86_BUILTIN_XSAVEOPT64:
31816 icode = CODE_FOR_xsaveopt64;
31817 break;
31818 default:
31819 gcc_unreachable ();
31822 op2 = gen_lowpart (SImode, op2);
31823 op1 = gen_lowpart (SImode, op1);
31824 pat = GEN_FCN (icode) (op0, op1, op2);
31826 else
31828 switch (fcode)
31830 case IX86_BUILTIN_XSAVE:
31831 icode = CODE_FOR_xsave;
31832 break;
31833 case IX86_BUILTIN_XRSTOR:
31834 icode = CODE_FOR_xrstor;
31835 break;
31836 case IX86_BUILTIN_XSAVEOPT:
31837 icode = CODE_FOR_xsaveopt;
31838 break;
31839 default:
31840 gcc_unreachable ();
31842 pat = GEN_FCN (icode) (op0, op1);
31845 if (pat)
31846 emit_insn (pat);
31847 return 0;
31849 case IX86_BUILTIN_LLWPCB:
31850 arg0 = CALL_EXPR_ARG (exp, 0);
31851 op0 = expand_normal (arg0);
31852 icode = CODE_FOR_lwp_llwpcb;
31853 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31854 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31855 emit_insn (gen_lwp_llwpcb (op0));
31856 return 0;
31858 case IX86_BUILTIN_SLWPCB:
31859 icode = CODE_FOR_lwp_slwpcb;
31860 if (!target
31861 || !insn_data[icode].operand[0].predicate (target, Pmode))
31862 target = gen_reg_rtx (Pmode);
31863 emit_insn (gen_lwp_slwpcb (target));
31864 return target;
31866 case IX86_BUILTIN_BEXTRI32:
31867 case IX86_BUILTIN_BEXTRI64:
31868 arg0 = CALL_EXPR_ARG (exp, 0);
31869 arg1 = CALL_EXPR_ARG (exp, 1);
31870 op0 = expand_normal (arg0);
31871 op1 = expand_normal (arg1);
31872 icode = (fcode == IX86_BUILTIN_BEXTRI32
31873 ? CODE_FOR_tbm_bextri_si
31874 : CODE_FOR_tbm_bextri_di);
31875 if (!CONST_INT_P (op1))
31877 error ("last argument must be an immediate");
31878 return const0_rtx;
31880 else
31882 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31883 unsigned char lsb_index = INTVAL (op1) & 0xFF;
31884 op1 = GEN_INT (length);
31885 op2 = GEN_INT (lsb_index);
31886 pat = GEN_FCN (icode) (target, op0, op1, op2);
31887 if (pat)
31888 emit_insn (pat);
31889 return target;
31892 case IX86_BUILTIN_RDRAND16_STEP:
31893 icode = CODE_FOR_rdrandhi_1;
31894 mode0 = HImode;
31895 goto rdrand_step;
31897 case IX86_BUILTIN_RDRAND32_STEP:
31898 icode = CODE_FOR_rdrandsi_1;
31899 mode0 = SImode;
31900 goto rdrand_step;
31902 case IX86_BUILTIN_RDRAND64_STEP:
31903 icode = CODE_FOR_rdranddi_1;
31904 mode0 = DImode;
31906 rdrand_step:
31907 op0 = gen_reg_rtx (mode0);
31908 emit_insn (GEN_FCN (icode) (op0));
31910 arg0 = CALL_EXPR_ARG (exp, 0);
31911 op1 = expand_normal (arg0);
31912 if (!address_operand (op1, VOIDmode))
31914 op1 = convert_memory_address (Pmode, op1);
31915 op1 = copy_addr_to_reg (op1);
31917 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31919 op1 = gen_reg_rtx (SImode);
31920 emit_move_insn (op1, CONST1_RTX (SImode));
31922 /* Emit SImode conditional move. */
31923 if (mode0 == HImode)
31925 op2 = gen_reg_rtx (SImode);
31926 emit_insn (gen_zero_extendhisi2 (op2, op0));
31928 else if (mode0 == SImode)
31929 op2 = op0;
31930 else
31931 op2 = gen_rtx_SUBREG (SImode, op0, 0);
31933 if (target == 0)
31934 target = gen_reg_rtx (SImode);
31936 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
31937 const0_rtx);
31938 emit_insn (gen_rtx_SET (VOIDmode, target,
31939 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
31940 return target;
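/* Illustrative sketch, not part of the original source: the *_STEP
   builtins above back <immintrin.h> wrappers such as _rdrand32_step, which
   store the random value through the pointer and return the carry flag
   (nonzero on success, zero when the DRNG had no entropy ready); compile
   with -mrdrnd:

     #include <immintrin.h>

     unsigned int
     random_u32 (void)
     {
       unsigned int r;
       while (!_rdrand32_step (&r))
         ;
       return r;
     }  */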
31942 case IX86_BUILTIN_RDSEED16_STEP:
31943 icode = CODE_FOR_rdseedhi_1;
31944 mode0 = HImode;
31945 goto rdseed_step;
31947 case IX86_BUILTIN_RDSEED32_STEP:
31948 icode = CODE_FOR_rdseedsi_1;
31949 mode0 = SImode;
31950 goto rdseed_step;
31952 case IX86_BUILTIN_RDSEED64_STEP:
31953 icode = CODE_FOR_rdseeddi_1;
31954 mode0 = DImode;
31956 rdseed_step:
31957 op0 = gen_reg_rtx (mode0);
31958 emit_insn (GEN_FCN (icode) (op0));
31960 arg0 = CALL_EXPR_ARG (exp, 0);
31961 op1 = expand_normal (arg0);
31962 if (!address_operand (op1, VOIDmode))
31964 op1 = convert_memory_address (Pmode, op1);
31965 op1 = copy_addr_to_reg (op1);
31967 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31969 op2 = gen_reg_rtx (QImode);
31971 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
31972 const0_rtx);
31973 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
31975 if (target == 0)
31976 target = gen_reg_rtx (SImode);
31978 emit_insn (gen_zero_extendqisi2 (target, op2));
31979 return target;
31981 case IX86_BUILTIN_ADDCARRYX32:
31982 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
31983 mode0 = SImode;
31984 goto addcarryx;
31986 case IX86_BUILTIN_ADDCARRYX64:
31987 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
31988 mode0 = DImode;
31990 addcarryx:
31991 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
31992 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
31993 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
31994 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
31996 op0 = gen_reg_rtx (QImode);
31998 /* Generate CF from input operand. */
31999 op1 = expand_normal (arg0);
32000 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32001 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32003 /* Generate the ADCX instruction to compute X + Y + CF. */
32004 op2 = expand_normal (arg1);
32005 op3 = expand_normal (arg2);
32007 if (!REG_P (op2))
32008 op2 = copy_to_mode_reg (mode0, op2);
32009 if (!REG_P (op3))
32010 op3 = copy_to_mode_reg (mode0, op3);
32012 op0 = gen_reg_rtx (mode0);
32014 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32015 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32016 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32018 /* Store the result. */
32019 op4 = expand_normal (arg3);
32020 if (!address_operand (op4, VOIDmode))
32022 op4 = convert_memory_address (Pmode, op4);
32023 op4 = copy_addr_to_reg (op4);
32025 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32027 /* Return current CF value. */
32028 if (target == 0)
32029 target = gen_reg_rtx (QImode);
32031 PUT_MODE (pat, QImode);
32032 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32033 return target;
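/* Illustrative sketch, not part of the original source: these builtins
   back the <immintrin.h> wrappers _addcarryx_u32/_addcarryx_u64, which add
   two words plus an incoming carry, store the sum through the pointer and
   return the outgoing carry; without ADX the plain add-with-carry pattern
   is selected above instead.  A 96-bit addition built from three
   carry-chained 32-bit adds:

     #include <immintrin.h>

     void
     add96 (unsigned int a[3], const unsigned int b[3])
     {
       unsigned char c;
       c = _addcarryx_u32 (0, a[0], b[0], &a[0]);
       c = _addcarryx_u32 (c, a[1], b[1], &a[1]);
       (void) _addcarryx_u32 (c, a[2], b[2], &a[2]);
     }  */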
32035 case IX86_BUILTIN_GATHERSIV2DF:
32036 icode = CODE_FOR_avx2_gathersiv2df;
32037 goto gather_gen;
32038 case IX86_BUILTIN_GATHERSIV4DF:
32039 icode = CODE_FOR_avx2_gathersiv4df;
32040 goto gather_gen;
32041 case IX86_BUILTIN_GATHERDIV2DF:
32042 icode = CODE_FOR_avx2_gatherdiv2df;
32043 goto gather_gen;
32044 case IX86_BUILTIN_GATHERDIV4DF:
32045 icode = CODE_FOR_avx2_gatherdiv4df;
32046 goto gather_gen;
32047 case IX86_BUILTIN_GATHERSIV4SF:
32048 icode = CODE_FOR_avx2_gathersiv4sf;
32049 goto gather_gen;
32050 case IX86_BUILTIN_GATHERSIV8SF:
32051 icode = CODE_FOR_avx2_gathersiv8sf;
32052 goto gather_gen;
32053 case IX86_BUILTIN_GATHERDIV4SF:
32054 icode = CODE_FOR_avx2_gatherdiv4sf;
32055 goto gather_gen;
32056 case IX86_BUILTIN_GATHERDIV8SF:
32057 icode = CODE_FOR_avx2_gatherdiv8sf;
32058 goto gather_gen;
32059 case IX86_BUILTIN_GATHERSIV2DI:
32060 icode = CODE_FOR_avx2_gathersiv2di;
32061 goto gather_gen;
32062 case IX86_BUILTIN_GATHERSIV4DI:
32063 icode = CODE_FOR_avx2_gathersiv4di;
32064 goto gather_gen;
32065 case IX86_BUILTIN_GATHERDIV2DI:
32066 icode = CODE_FOR_avx2_gatherdiv2di;
32067 goto gather_gen;
32068 case IX86_BUILTIN_GATHERDIV4DI:
32069 icode = CODE_FOR_avx2_gatherdiv4di;
32070 goto gather_gen;
32071 case IX86_BUILTIN_GATHERSIV4SI:
32072 icode = CODE_FOR_avx2_gathersiv4si;
32073 goto gather_gen;
32074 case IX86_BUILTIN_GATHERSIV8SI:
32075 icode = CODE_FOR_avx2_gathersiv8si;
32076 goto gather_gen;
32077 case IX86_BUILTIN_GATHERDIV4SI:
32078 icode = CODE_FOR_avx2_gatherdiv4si;
32079 goto gather_gen;
32080 case IX86_BUILTIN_GATHERDIV8SI:
32081 icode = CODE_FOR_avx2_gatherdiv8si;
32082 goto gather_gen;
32083 case IX86_BUILTIN_GATHERALTSIV4DF:
32084 icode = CODE_FOR_avx2_gathersiv4df;
32085 goto gather_gen;
32086 case IX86_BUILTIN_GATHERALTDIV8SF:
32087 icode = CODE_FOR_avx2_gatherdiv8sf;
32088 goto gather_gen;
32089 case IX86_BUILTIN_GATHERALTSIV4DI:
32090 icode = CODE_FOR_avx2_gathersiv4di;
32091 goto gather_gen;
32092 case IX86_BUILTIN_GATHERALTDIV8SI:
32093 icode = CODE_FOR_avx2_gatherdiv8si;
32094 goto gather_gen;
32096 gather_gen:
32097 arg0 = CALL_EXPR_ARG (exp, 0);
32098 arg1 = CALL_EXPR_ARG (exp, 1);
32099 arg2 = CALL_EXPR_ARG (exp, 2);
32100 arg3 = CALL_EXPR_ARG (exp, 3);
32101 arg4 = CALL_EXPR_ARG (exp, 4);
32102 op0 = expand_normal (arg0);
32103 op1 = expand_normal (arg1);
32104 op2 = expand_normal (arg2);
32105 op3 = expand_normal (arg3);
32106 op4 = expand_normal (arg4);
32107 /* Note the arg order is different from the operand order. */
32108 mode0 = insn_data[icode].operand[1].mode;
32109 mode2 = insn_data[icode].operand[3].mode;
32110 mode3 = insn_data[icode].operand[4].mode;
32111 mode4 = insn_data[icode].operand[5].mode;
32113 if (target == NULL_RTX
32114 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32115 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32116 else
32117 subtarget = target;
32119 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32120 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32122 rtx half = gen_reg_rtx (V4SImode);
32123 if (!nonimmediate_operand (op2, V8SImode))
32124 op2 = copy_to_mode_reg (V8SImode, op2);
32125 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32126 op2 = half;
32128 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32129 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32131 rtx (*gen) (rtx, rtx);
32132 rtx half = gen_reg_rtx (mode0);
32133 if (mode0 == V4SFmode)
32134 gen = gen_vec_extract_lo_v8sf;
32135 else
32136 gen = gen_vec_extract_lo_v8si;
32137 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32138 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32139 emit_insn (gen (half, op0));
32140 op0 = half;
32141 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32142 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32143 emit_insn (gen (half, op3));
32144 op3 = half;
32147 /* Force the memory operand to use only a base register here, but
32148 we don't want to do this for the memory operands of other builtin
32149 functions. */
32150 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32152 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32153 op0 = copy_to_mode_reg (mode0, op0);
32154 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32155 op1 = copy_to_mode_reg (Pmode, op1);
32156 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32157 op2 = copy_to_mode_reg (mode2, op2);
32158 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32159 op3 = copy_to_mode_reg (mode3, op3);
32160 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32162 error ("last argument must be scale 1, 2, 4, 8");
32163 return const0_rtx;
32166 /* Optimize. If mask is known to have all high bits set,
32167 replace op0 with pc_rtx to signal that the instruction
32168 overwrites the whole destination and doesn't use its
32169 previous contents. */
32170 if (optimize)
32172 if (TREE_CODE (arg3) == VECTOR_CST)
32174 unsigned int negative = 0;
32175 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32177 tree cst = VECTOR_CST_ELT (arg3, i);
32178 if (TREE_CODE (cst) == INTEGER_CST
32179 && tree_int_cst_sign_bit (cst))
32180 negative++;
32181 else if (TREE_CODE (cst) == REAL_CST
32182 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32183 negative++;
32185 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32186 op0 = pc_rtx;
32188 else if (TREE_CODE (arg3) == SSA_NAME)
32190 /* Recognize also when mask is like:
32191 __v2df src = _mm_setzero_pd ();
32192 __v2df mask = _mm_cmpeq_pd (src, src);
32194 __v8sf src = _mm256_setzero_ps ();
32195 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32196 as that is a cheaper way to load all ones into
32197 a register than having to load a constant from
32198 memory. */
32199 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32200 if (is_gimple_call (def_stmt))
32202 tree fndecl = gimple_call_fndecl (def_stmt);
32203 if (fndecl
32204 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32205 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32207 case IX86_BUILTIN_CMPPD:
32208 case IX86_BUILTIN_CMPPS:
32209 case IX86_BUILTIN_CMPPD256:
32210 case IX86_BUILTIN_CMPPS256:
32211 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32212 break;
32213 /* FALLTHRU */
32214 case IX86_BUILTIN_CMPEQPD:
32215 case IX86_BUILTIN_CMPEQPS:
32216 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32217 && initializer_zerop (gimple_call_arg (def_stmt,
32218 1)))
32219 op0 = pc_rtx;
32220 break;
32221 default:
32222 break;
32228 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32229 if (! pat)
32230 return const0_rtx;
32231 emit_insn (pat);
32233 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32234 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32236 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32237 ? V4SFmode : V4SImode;
32238 if (target == NULL_RTX)
32239 target = gen_reg_rtx (tmode);
32240 if (tmode == V4SFmode)
32241 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32242 else
32243 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32245 else
32246 target = subtarget;
32248 return target;
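/* Illustrative sketch, not part of the original source: the mask
   optimization above recognizes an all-ones mask produced by comparing a
   value with itself, so a gather written as below (assuming
   _mm256_mask_i32gather_pd from <immintrin.h> reaches this expander) no
   longer depends on the previous contents of the destination:

     #include <immintrin.h>

     __m256d
     gather4 (const double *base, __m128i idx)
     {
       __m256d src  = _mm256_setzero_pd ();
       __m256d mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
       return _mm256_mask_i32gather_pd (src, base, idx, mask, 8);
     }  */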
32250 case IX86_BUILTIN_XABORT:
32251 icode = CODE_FOR_xabort;
32252 arg0 = CALL_EXPR_ARG (exp, 0);
32253 op0 = expand_normal (arg0);
32254 mode0 = insn_data[icode].operand[0].mode;
32255 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32257 error ("the xabort's argument must be an 8-bit immediate");
32258 return const0_rtx;
32260 emit_insn (gen_xabort (op0));
32261 return 0;
32263 default:
32264 break;
32267 for (i = 0, d = bdesc_special_args;
32268 i < ARRAY_SIZE (bdesc_special_args);
32269 i++, d++)
32270 if (d->code == fcode)
32271 return ix86_expand_special_args_builtin (d, exp, target);
32273 for (i = 0, d = bdesc_args;
32274 i < ARRAY_SIZE (bdesc_args);
32275 i++, d++)
32276 if (d->code == fcode)
32277 switch (fcode)
32279 case IX86_BUILTIN_FABSQ:
32280 case IX86_BUILTIN_COPYSIGNQ:
32281 if (!TARGET_SSE)
32282 /* Emit a normal call if SSE isn't available. */
32283 return expand_call (exp, target, ignore);
32284 default:
32285 return ix86_expand_args_builtin (d, exp, target);
32288 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32289 if (d->code == fcode)
32290 return ix86_expand_sse_comi (d, exp, target);
32292 for (i = 0, d = bdesc_pcmpestr;
32293 i < ARRAY_SIZE (bdesc_pcmpestr);
32294 i++, d++)
32295 if (d->code == fcode)
32296 return ix86_expand_sse_pcmpestr (d, exp, target);
32298 for (i = 0, d = bdesc_pcmpistr;
32299 i < ARRAY_SIZE (bdesc_pcmpistr);
32300 i++, d++)
32301 if (d->code == fcode)
32302 return ix86_expand_sse_pcmpistr (d, exp, target);
32304 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32305 if (d->code == fcode)
32306 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32307 (enum ix86_builtin_func_type)
32308 d->flag, d->comparison);
32310 gcc_unreachable ();
32313 /* Returns a function decl for a vectorized version of the builtin function
32314 with builtin function code FN and the result vector type TYPE, or NULL_TREE
32315 if it is not available. */
32317 static tree
32318 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32319 tree type_in)
32321 enum machine_mode in_mode, out_mode;
32322 int in_n, out_n;
32323 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32325 if (TREE_CODE (type_out) != VECTOR_TYPE
32326 || TREE_CODE (type_in) != VECTOR_TYPE
32327 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32328 return NULL_TREE;
32330 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32331 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32332 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32333 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32335 switch (fn)
32337 case BUILT_IN_SQRT:
32338 if (out_mode == DFmode && in_mode == DFmode)
32340 if (out_n == 2 && in_n == 2)
32341 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32342 else if (out_n == 4 && in_n == 4)
32343 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32345 break;
32347 case BUILT_IN_SQRTF:
32348 if (out_mode == SFmode && in_mode == SFmode)
32350 if (out_n == 4 && in_n == 4)
32351 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32352 else if (out_n == 8 && in_n == 8)
32353 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32355 break;
32357 case BUILT_IN_IFLOOR:
32358 case BUILT_IN_LFLOOR:
32359 case BUILT_IN_LLFLOOR:
32360 /* The round insn does not trap on denormals. */
32361 if (flag_trapping_math || !TARGET_ROUND)
32362 break;
32364 if (out_mode == SImode && in_mode == DFmode)
32366 if (out_n == 4 && in_n == 2)
32367 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32368 else if (out_n == 8 && in_n == 4)
32369 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32371 break;
32373 case BUILT_IN_IFLOORF:
32374 case BUILT_IN_LFLOORF:
32375 case BUILT_IN_LLFLOORF:
32376 /* The round insn does not trap on denormals. */
32377 if (flag_trapping_math || !TARGET_ROUND)
32378 break;
32380 if (out_mode == SImode && in_mode == SFmode)
32382 if (out_n == 4 && in_n == 4)
32383 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32384 else if (out_n == 8 && in_n == 8)
32385 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32387 break;
32389 case BUILT_IN_ICEIL:
32390 case BUILT_IN_LCEIL:
32391 case BUILT_IN_LLCEIL:
32392 /* The round insn does not trap on denormals. */
32393 if (flag_trapping_math || !TARGET_ROUND)
32394 break;
32396 if (out_mode == SImode && in_mode == DFmode)
32398 if (out_n == 4 && in_n == 2)
32399 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32400 else if (out_n == 8 && in_n == 4)
32401 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32403 break;
32405 case BUILT_IN_ICEILF:
32406 case BUILT_IN_LCEILF:
32407 case BUILT_IN_LLCEILF:
32408 /* The round insn does not trap on denormals. */
32409 if (flag_trapping_math || !TARGET_ROUND)
32410 break;
32412 if (out_mode == SImode && in_mode == SFmode)
32414 if (out_n == 4 && in_n == 4)
32415 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32416 else if (out_n == 8 && in_n == 8)
32417 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32419 break;
32421 case BUILT_IN_IRINT:
32422 case BUILT_IN_LRINT:
32423 case BUILT_IN_LLRINT:
32424 if (out_mode == SImode && in_mode == DFmode)
32426 if (out_n == 4 && in_n == 2)
32427 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32428 else if (out_n == 8 && in_n == 4)
32429 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32431 break;
32433 case BUILT_IN_IRINTF:
32434 case BUILT_IN_LRINTF:
32435 case BUILT_IN_LLRINTF:
32436 if (out_mode == SImode && in_mode == SFmode)
32438 if (out_n == 4 && in_n == 4)
32439 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32440 else if (out_n == 8 && in_n == 8)
32441 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32443 break;
32445 case BUILT_IN_IROUND:
32446 case BUILT_IN_LROUND:
32447 case BUILT_IN_LLROUND:
32448 /* The round insn does not trap on denormals. */
32449 if (flag_trapping_math || !TARGET_ROUND)
32450 break;
32452 if (out_mode == SImode && in_mode == DFmode)
32454 if (out_n == 4 && in_n == 2)
32455 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32456 else if (out_n == 8 && in_n == 4)
32457 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32459 break;
32461 case BUILT_IN_IROUNDF:
32462 case BUILT_IN_LROUNDF:
32463 case BUILT_IN_LLROUNDF:
32464 /* The round insn does not trap on denormals. */
32465 if (flag_trapping_math || !TARGET_ROUND)
32466 break;
32468 if (out_mode == SImode && in_mode == SFmode)
32470 if (out_n == 4 && in_n == 4)
32471 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32472 else if (out_n == 8 && in_n == 8)
32473 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32475 break;
32477 case BUILT_IN_COPYSIGN:
32478 if (out_mode == DFmode && in_mode == DFmode)
32480 if (out_n == 2 && in_n == 2)
32481 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32482 else if (out_n == 4 && in_n == 4)
32483 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32485 break;
32487 case BUILT_IN_COPYSIGNF:
32488 if (out_mode == SFmode && in_mode == SFmode)
32490 if (out_n == 4 && in_n == 4)
32491 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32492 else if (out_n == 8 && in_n == 8)
32493 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32495 break;
32497 case BUILT_IN_FLOOR:
32498 /* The round insn does not trap on denormals. */
32499 if (flag_trapping_math || !TARGET_ROUND)
32500 break;
32502 if (out_mode == DFmode && in_mode == DFmode)
32504 if (out_n == 2 && in_n == 2)
32505 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32506 else if (out_n == 4 && in_n == 4)
32507 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32509 break;
32511 case BUILT_IN_FLOORF:
32512 /* The round insn does not trap on denormals. */
32513 if (flag_trapping_math || !TARGET_ROUND)
32514 break;
32516 if (out_mode == SFmode && in_mode == SFmode)
32518 if (out_n == 4 && in_n == 4)
32519 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32520 else if (out_n == 8 && in_n == 8)
32521 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32523 break;
32525 case BUILT_IN_CEIL:
32526 /* The round insn does not trap on denormals. */
32527 if (flag_trapping_math || !TARGET_ROUND)
32528 break;
32530 if (out_mode == DFmode && in_mode == DFmode)
32532 if (out_n == 2 && in_n == 2)
32533 return ix86_builtins[IX86_BUILTIN_CEILPD];
32534 else if (out_n == 4 && in_n == 4)
32535 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32537 break;
32539 case BUILT_IN_CEILF:
32540 /* The round insn does not trap on denormals. */
32541 if (flag_trapping_math || !TARGET_ROUND)
32542 break;
32544 if (out_mode == SFmode && in_mode == SFmode)
32546 if (out_n == 4 && in_n == 4)
32547 return ix86_builtins[IX86_BUILTIN_CEILPS];
32548 else if (out_n == 8 && in_n == 8)
32549 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32551 break;
32553 case BUILT_IN_TRUNC:
32554 /* The round insn does not trap on denormals. */
32555 if (flag_trapping_math || !TARGET_ROUND)
32556 break;
32558 if (out_mode == DFmode && in_mode == DFmode)
32560 if (out_n == 2 && in_n == 2)
32561 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32562 else if (out_n == 4 && in_n == 4)
32563 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32565 break;
32567 case BUILT_IN_TRUNCF:
32568 /* The round insn does not trap on denormals. */
32569 if (flag_trapping_math || !TARGET_ROUND)
32570 break;
32572 if (out_mode == SFmode && in_mode == SFmode)
32574 if (out_n == 4 && in_n == 4)
32575 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32576 else if (out_n == 8 && in_n == 8)
32577 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32579 break;
32581 case BUILT_IN_RINT:
32582 /* The round insn does not trap on denormals. */
32583 if (flag_trapping_math || !TARGET_ROUND)
32584 break;
32586 if (out_mode == DFmode && in_mode == DFmode)
32588 if (out_n == 2 && in_n == 2)
32589 return ix86_builtins[IX86_BUILTIN_RINTPD];
32590 else if (out_n == 4 && in_n == 4)
32591 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32593 break;
32595 case BUILT_IN_RINTF:
32596 /* The round insn does not trap on denormals. */
32597 if (flag_trapping_math || !TARGET_ROUND)
32598 break;
32600 if (out_mode == SFmode && in_mode == SFmode)
32602 if (out_n == 4 && in_n == 4)
32603 return ix86_builtins[IX86_BUILTIN_RINTPS];
32604 else if (out_n == 8 && in_n == 8)
32605 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32607 break;
32609 case BUILT_IN_ROUND:
32610 /* The round insn does not trap on denormals. */
32611 if (flag_trapping_math || !TARGET_ROUND)
32612 break;
32614 if (out_mode == DFmode && in_mode == DFmode)
32616 if (out_n == 2 && in_n == 2)
32617 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32618 else if (out_n == 4 && in_n == 4)
32619 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32621 break;
32623 case BUILT_IN_ROUNDF:
32624 /* The round insn does not trap on denormals. */
32625 if (flag_trapping_math || !TARGET_ROUND)
32626 break;
32628 if (out_mode == SFmode && in_mode == SFmode)
32630 if (out_n == 4 && in_n == 4)
32631 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32632 else if (out_n == 8 && in_n == 8)
32633 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32635 break;
32637 case BUILT_IN_FMA:
32638 if (out_mode == DFmode && in_mode == DFmode)
32640 if (out_n == 2 && in_n == 2)
32641 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32642 if (out_n == 4 && in_n == 4)
32643 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32645 break;
32647 case BUILT_IN_FMAF:
32648 if (out_mode == SFmode && in_mode == SFmode)
32650 if (out_n == 4 && in_n == 4)
32651 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32652 if (out_n == 8 && in_n == 8)
32653 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32655 break;
32657 default:
32658 break;
32661 /* Dispatch to a handler for a vectorization library. */
32662 if (ix86_veclib_handler)
32663 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32664 type_in);
32666 return NULL_TREE;
32669 /* Handler for an SVML-style interface to
32670 a library with vectorized intrinsics. */
32672 static tree
32673 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32675 char name[20];
32676 tree fntype, new_fndecl, args;
32677 unsigned arity;
32678 const char *bname;
32679 enum machine_mode el_mode, in_mode;
32680 int n, in_n;
32682 /* The SVML is suitable for unsafe math only. */
32683 if (!flag_unsafe_math_optimizations)
32684 return NULL_TREE;
32686 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32687 n = TYPE_VECTOR_SUBPARTS (type_out);
32688 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32689 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32690 if (el_mode != in_mode
32691 || n != in_n)
32692 return NULL_TREE;
32694 switch (fn)
32696 case BUILT_IN_EXP:
32697 case BUILT_IN_LOG:
32698 case BUILT_IN_LOG10:
32699 case BUILT_IN_POW:
32700 case BUILT_IN_TANH:
32701 case BUILT_IN_TAN:
32702 case BUILT_IN_ATAN:
32703 case BUILT_IN_ATAN2:
32704 case BUILT_IN_ATANH:
32705 case BUILT_IN_CBRT:
32706 case BUILT_IN_SINH:
32707 case BUILT_IN_SIN:
32708 case BUILT_IN_ASINH:
32709 case BUILT_IN_ASIN:
32710 case BUILT_IN_COSH:
32711 case BUILT_IN_COS:
32712 case BUILT_IN_ACOSH:
32713 case BUILT_IN_ACOS:
32714 if (el_mode != DFmode || n != 2)
32715 return NULL_TREE;
32716 break;
32718 case BUILT_IN_EXPF:
32719 case BUILT_IN_LOGF:
32720 case BUILT_IN_LOG10F:
32721 case BUILT_IN_POWF:
32722 case BUILT_IN_TANHF:
32723 case BUILT_IN_TANF:
32724 case BUILT_IN_ATANF:
32725 case BUILT_IN_ATAN2F:
32726 case BUILT_IN_ATANHF:
32727 case BUILT_IN_CBRTF:
32728 case BUILT_IN_SINHF:
32729 case BUILT_IN_SINF:
32730 case BUILT_IN_ASINHF:
32731 case BUILT_IN_ASINF:
32732 case BUILT_IN_COSHF:
32733 case BUILT_IN_COSF:
32734 case BUILT_IN_ACOSHF:
32735 case BUILT_IN_ACOSF:
32736 if (el_mode != SFmode || n != 4)
32737 return NULL_TREE;
32738 break;
32740 default:
32741 return NULL_TREE;
32744 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32746 if (fn == BUILT_IN_LOGF)
32747 strcpy (name, "vmlsLn4");
32748 else if (fn == BUILT_IN_LOG)
32749 strcpy (name, "vmldLn2");
32750 else if (n == 4)
32752 sprintf (name, "vmls%s", bname+10);
32753 name[strlen (name)-1] = '4';
32755 else
32756 sprintf (name, "vmld%s2", bname+10);
32758 /* Convert to uppercase. */
32759 name[4] &= ~0x20;
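/* Worked example (derived from the mangling code above, not from SVML
   documentation): BUILT_IN_EXPF has bname "__builtin_expf", so bname+10 is
   "expf"; the n == 4 branch builds "vmlsexpf", the last character is
   overwritten to give "vmlsexp4", and the uppercase fixup yields
   "vmlsExp4".  The DFmode BUILT_IN_EXP case yields "vmldExp2".  */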
32761 arity = 0;
32762 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32763 args;
32764 args = TREE_CHAIN (args))
32765 arity++;
32767 if (arity == 1)
32768 fntype = build_function_type_list (type_out, type_in, NULL);
32769 else
32770 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32772 /* Build a function declaration for the vectorized function. */
32773 new_fndecl = build_decl (BUILTINS_LOCATION,
32774 FUNCTION_DECL, get_identifier (name), fntype);
32775 TREE_PUBLIC (new_fndecl) = 1;
32776 DECL_EXTERNAL (new_fndecl) = 1;
32777 DECL_IS_NOVOPS (new_fndecl) = 1;
32778 TREE_READONLY (new_fndecl) = 1;
32780 return new_fndecl;
32783 /* Handler for an ACML-style interface to
32784 a library with vectorized intrinsics. */
32786 static tree
32787 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32789 char name[20] = "__vr.._";
32790 tree fntype, new_fndecl, args;
32791 unsigned arity;
32792 const char *bname;
32793 enum machine_mode el_mode, in_mode;
32794 int n, in_n;
32796 /* The ACML is 64bits only and suitable for unsafe math only as
32797 it does not correctly support parts of IEEE with the required
32798 precision such as denormals. */
32799 if (!TARGET_64BIT
32800 || !flag_unsafe_math_optimizations)
32801 return NULL_TREE;
32803 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32804 n = TYPE_VECTOR_SUBPARTS (type_out);
32805 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32806 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32807 if (el_mode != in_mode
32808 || n != in_n)
32809 return NULL_TREE;
32811 switch (fn)
32813 case BUILT_IN_SIN:
32814 case BUILT_IN_COS:
32815 case BUILT_IN_EXP:
32816 case BUILT_IN_LOG:
32817 case BUILT_IN_LOG2:
32818 case BUILT_IN_LOG10:
32819 name[4] = 'd';
32820 name[5] = '2';
32821 if (el_mode != DFmode
32822 || n != 2)
32823 return NULL_TREE;
32824 break;
32826 case BUILT_IN_SINF:
32827 case BUILT_IN_COSF:
32828 case BUILT_IN_EXPF:
32829 case BUILT_IN_POWF:
32830 case BUILT_IN_LOGF:
32831 case BUILT_IN_LOG2F:
32832 case BUILT_IN_LOG10F:
32833 name[4] = 's';
32834 name[5] = '4';
32835 if (el_mode != SFmode
32836 || n != 4)
32837 return NULL_TREE;
32838 break;
32840 default:
32841 return NULL_TREE;
32844 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32845 sprintf (name + 7, "%s", bname+10);
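/* Worked example (derived from the template "__vr.._" and the code above,
   not from ACML documentation): BUILT_IN_SINF fills in 's' and '4' and
   appends bname+10 ("sinf"), giving "__vrs4_sinf"; BUILT_IN_SIN gives
   "__vrd2_sin".  */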
32847 arity = 0;
32848 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32849 args;
32850 args = TREE_CHAIN (args))
32851 arity++;
32853 if (arity == 1)
32854 fntype = build_function_type_list (type_out, type_in, NULL);
32855 else
32856 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32858 /* Build a function declaration for the vectorized function. */
32859 new_fndecl = build_decl (BUILTINS_LOCATION,
32860 FUNCTION_DECL, get_identifier (name), fntype);
32861 TREE_PUBLIC (new_fndecl) = 1;
32862 DECL_EXTERNAL (new_fndecl) = 1;
32863 DECL_IS_NOVOPS (new_fndecl) = 1;
32864 TREE_READONLY (new_fndecl) = 1;
32866 return new_fndecl;
32869 /* Returns a decl of a function that implements gather load with
32870 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
32871 Return NULL_TREE if it is not available. */
32873 static tree
32874 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32875 const_tree index_type, int scale)
32877 bool si;
32878 enum ix86_builtins code;
32880 if (! TARGET_AVX2)
32881 return NULL_TREE;
32883 if ((TREE_CODE (index_type) != INTEGER_TYPE
32884 && !POINTER_TYPE_P (index_type))
32885 || (TYPE_MODE (index_type) != SImode
32886 && TYPE_MODE (index_type) != DImode))
32887 return NULL_TREE;
32889 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32890 return NULL_TREE;
32892 /* v*gather* insn sign extends index to pointer mode. */
32893 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32894 && TYPE_UNSIGNED (index_type))
32895 return NULL_TREE;
32897 if (scale <= 0
32898 || scale > 8
32899 || (scale & (scale - 1)) != 0)
32900 return NULL_TREE;
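/* In other words, only scales of 1, 2, 4 and 8 are accepted, matching the
   scale field of the v*gather* (VSIB) addressing form.  */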
32902 si = TYPE_MODE (index_type) == SImode;
32903 switch (TYPE_MODE (mem_vectype))
32905 case V2DFmode:
32906 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32907 break;
32908 case V4DFmode:
32909 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32910 break;
32911 case V2DImode:
32912 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32913 break;
32914 case V4DImode:
32915 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32916 break;
32917 case V4SFmode:
32918 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32919 break;
32920 case V8SFmode:
32921 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32922 break;
32923 case V4SImode:
32924 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32925 break;
32926 case V8SImode:
32927 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
32928 break;
32929 default:
32930 return NULL_TREE;
32933 return ix86_builtins[code];
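/* Worked example (read off the switch above): a gather of a V4SFmode
   vector with an SImode index vector maps to IX86_BUILTIN_GATHERSIV4SF,
   and with a DImode index to IX86_BUILTIN_GATHERDIV4SF.  */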
32936 /* Returns a code for a target-specific builtin that implements
32937 reciprocal of the function, or NULL_TREE if not available. */
32939 static tree
32940 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
32941 bool sqrt ATTRIBUTE_UNUSED)
32943 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
32944 && flag_finite_math_only && !flag_trapping_math
32945 && flag_unsafe_math_optimizations))
32946 return NULL_TREE;
32948 if (md_fn)
32949 /* Machine dependent builtins. */
32950 switch (fn)
32952 /* Vectorized version of sqrt to rsqrt conversion. */
32953 case IX86_BUILTIN_SQRTPS_NR:
32954 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
32956 case IX86_BUILTIN_SQRTPS_NR256:
32957 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
32959 default:
32960 return NULL_TREE;
32962 else
32963 /* Normal builtins. */
32964 switch (fn)
32966 /* Sqrt to rsqrt conversion. */
32967 case BUILT_IN_SQRTF:
32968 return ix86_builtins[IX86_BUILTIN_RSQRTF];
32970 default:
32971 return NULL_TREE;
32975 /* Helper for avx_vpermilps256_operand et al. This is also used by
32976 the expansion functions to turn the parallel back into a mask.
32977 The return value is 0 for no match and the imm8+1 for a match. */
32979 int
32980 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
32982 unsigned i, nelt = GET_MODE_NUNITS (mode);
32983 unsigned mask = 0;
32984 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
32986 if (XVECLEN (par, 0) != (int) nelt)
32987 return 0;
32989 /* Validate that all of the elements are constants, and not totally
32990 out of range. Copy the data into an integral array to make the
32991 subsequent checks easier. */
32992 for (i = 0; i < nelt; ++i)
32994 rtx er = XVECEXP (par, 0, i);
32995 unsigned HOST_WIDE_INT ei;
32997 if (!CONST_INT_P (er))
32998 return 0;
32999 ei = INTVAL (er);
33000 if (ei >= nelt)
33001 return 0;
33002 ipar[i] = ei;
33005 switch (mode)
33007 case V4DFmode:
33008 /* In the 256-bit DFmode case, we can only move elements within
33009 a 128-bit lane. */
33010 for (i = 0; i < 2; ++i)
33012 if (ipar[i] >= 2)
33013 return 0;
33014 mask |= ipar[i] << i;
33016 for (i = 2; i < 4; ++i)
33018 if (ipar[i] < 2)
33019 return 0;
33020 mask |= (ipar[i] - 2) << i;
33022 break;
33024 case V8SFmode:
33025 /* In the 256-bit SFmode case, we have full freedom of movement
33026 within the low 128-bit lane, but the high 128-bit lane must
33027 mirror the exact same pattern. */
33028 for (i = 0; i < 4; ++i)
33029 if (ipar[i] + 4 != ipar[i + 4])
33030 return 0;
33031 nelt = 4;
33032 /* FALLTHRU */
33034 case V2DFmode:
33035 case V4SFmode:
33036 /* In the 128-bit case, we've full freedom in the placement of
33037 the elements from the source operand. */
33038 for (i = 0; i < nelt; ++i)
33039 mask |= ipar[i] << (i * (nelt / 2));
33040 break;
33042 default:
33043 gcc_unreachable ();
33046 /* Make sure success has a non-zero value by adding one. */
33047 return mask + 1;
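/* Worked example (following the V4SFmode arm above): the parallel
   (0 3 2 1) gives mask = 0 | 3<<2 | 2<<4 | 1<<6 = 0x6c, so the function
   returns 0x6d and the imm8 encoded by the caller is 0x6c.  */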
33050 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33051 the expansion functions to turn the parallel back into a mask.
33052 The return value is 0 for no match and the imm8+1 for a match. */
33054 int
33055 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33057 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33058 unsigned mask = 0;
33059 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33061 if (XVECLEN (par, 0) != (int) nelt)
33062 return 0;
33064 /* Validate that all of the elements are constants, and not totally
33065 out of range. Copy the data into an integral array to make the
33066 subsequent checks easier. */
33067 for (i = 0; i < nelt; ++i)
33069 rtx er = XVECEXP (par, 0, i);
33070 unsigned HOST_WIDE_INT ei;
33072 if (!CONST_INT_P (er))
33073 return 0;
33074 ei = INTVAL (er);
33075 if (ei >= 2 * nelt)
33076 return 0;
33077 ipar[i] = ei;
33080 /* Validate that the halves of the permute are halves. */
33081 for (i = 0; i < nelt2 - 1; ++i)
33082 if (ipar[i] + 1 != ipar[i + 1])
33083 return 0;
33084 for (i = nelt2; i < nelt - 1; ++i)
33085 if (ipar[i] + 1 != ipar[i + 1])
33086 return 0;
33088 /* Reconstruct the mask. */
33089 for (i = 0; i < 2; ++i)
33091 unsigned e = ipar[i * nelt2];
33092 if (e % nelt2)
33093 return 0;
33094 e /= nelt2;
33095 mask |= e << (i * 4);
33098 /* Make sure success has a non-zero value by adding one. */
33099 return mask + 1;
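/* Worked example (V8SFmode, so nelt2 == 4): the lane-swapping parallel
   (4 5 6 7 0 1 2 3) passes the contiguity checks; ipar[0] / 4 == 1 and
   ipar[4] / 4 == 0, so mask == 0x01 and the function returns 0x02.  */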
33102 /* Store OPERAND to the memory after reload is completed. This means
33103 that we can't easily use assign_stack_local. */
33104 rtx
33105 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33107 rtx result;
33109 gcc_assert (reload_completed);
33110 if (ix86_using_red_zone ())
33112 result = gen_rtx_MEM (mode,
33113 gen_rtx_PLUS (Pmode,
33114 stack_pointer_rtx,
33115 GEN_INT (-RED_ZONE_SIZE)));
33116 emit_move_insn (result, operand);
33118 else if (TARGET_64BIT)
33120 switch (mode)
33122 case HImode:
33123 case SImode:
33124 operand = gen_lowpart (DImode, operand);
33125 /* FALLTHRU */
33126 case DImode:
33127 emit_insn (
33128 gen_rtx_SET (VOIDmode,
33129 gen_rtx_MEM (DImode,
33130 gen_rtx_PRE_DEC (DImode,
33131 stack_pointer_rtx)),
33132 operand));
33133 break;
33134 default:
33135 gcc_unreachable ();
33137 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33139 else
33141 switch (mode)
33143 case DImode:
33145 rtx operands[2];
33146 split_double_mode (mode, &operand, 1, operands, operands + 1);
33147 emit_insn (
33148 gen_rtx_SET (VOIDmode,
33149 gen_rtx_MEM (SImode,
33150 gen_rtx_PRE_DEC (Pmode,
33151 stack_pointer_rtx)),
33152 operands[1]));
33153 emit_insn (
33154 gen_rtx_SET (VOIDmode,
33155 gen_rtx_MEM (SImode,
33156 gen_rtx_PRE_DEC (Pmode,
33157 stack_pointer_rtx)),
33158 operands[0]));
33160 break;
33161 case HImode:
33162 /* Store HImodes as SImodes. */
33163 operand = gen_lowpart (SImode, operand);
33164 /* FALLTHRU */
33165 case SImode:
33166 emit_insn (
33167 gen_rtx_SET (VOIDmode,
33168 gen_rtx_MEM (GET_MODE (operand),
33169 gen_rtx_PRE_DEC (SImode,
33170 stack_pointer_rtx)),
33171 operand));
33172 break;
33173 default:
33174 gcc_unreachable ();
33176 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33178 return result;
33181 /* Free operand from the memory. */
33182 void
33183 ix86_free_from_memory (enum machine_mode mode)
33185 if (!ix86_using_red_zone ())
33187 int size;
33189 if (mode == DImode || TARGET_64BIT)
33190 size = 8;
33191 else
33192 size = 4;
33193 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33194 to a pop or add instruction if registers are available. */
33195 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33196 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33197 GEN_INT (size))));
33201 /* Return a register priority for hard reg REGNO. */
33202 static int
33203 ix86_register_priority (int hard_regno)
33205 /* ebp and r13 as the base always want a displacement, and r12 as the
33206 base always wants an index. So discourage their use in an
33207 address. */
33208 if (hard_regno == R12_REG || hard_regno == R13_REG)
33209 return 0;
33210 if (hard_regno == BP_REG)
33211 return 1;
33212 /* New x86-64 int registers result in bigger code size. Discourage
33213 them. */
33214 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33215 return 2;
33216 /* New x86-64 SSE registers result in bigger code size. Discourage
33217 them. */
33218 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33219 return 2;
33220 /* Usage of AX register results in smaller code. Prefer it. */
33221 if (hard_regno == 0)
33222 return 4;
33223 return 3;
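/* The resulting priority order is therefore r12/r13 (0) < ebp (1)
   < REX integer and SSE registers (2) < the remaining registers (3)
   < eax (4), with a larger value meaning the register is preferred.  */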
33226 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33228 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33229 QImode must go into class Q_REGS.
33230 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33231 movdf to do mem-to-mem moves through integer regs. */
33233 static reg_class_t
33234 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33236 enum machine_mode mode = GET_MODE (x);
33238 /* We're only allowed to return a subclass of CLASS. Many of the
33239 following checks fail for NO_REGS, so eliminate that early. */
33240 if (regclass == NO_REGS)
33241 return NO_REGS;
33243 /* All classes can load zeros. */
33244 if (x == CONST0_RTX (mode))
33245 return regclass;
33247 /* Force constants into memory if we are loading a (nonzero) constant into
33248 an MMX or SSE register. This is because there are no MMX/SSE instructions
33249 to load from a constant. */
33250 if (CONSTANT_P (x)
33251 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33252 return NO_REGS;
33254 /* Prefer SSE regs only, if we can use them for math. */
33255 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33256 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33258 /* Floating-point constants need more complex checks. */
33259 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33261 /* General regs can load everything. */
33262 if (reg_class_subset_p (regclass, GENERAL_REGS))
33263 return regclass;
33265 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33266 zero above. We only want to wind up preferring 80387 registers if
33267 we plan on doing computation with them. */
33268 if (TARGET_80387
33269 && standard_80387_constant_p (x) > 0)
33271 /* Limit class to non-sse. */
33272 if (regclass == FLOAT_SSE_REGS)
33273 return FLOAT_REGS;
33274 if (regclass == FP_TOP_SSE_REGS)
33275 return FP_TOP_REG;
33276 if (regclass == FP_SECOND_SSE_REGS)
33277 return FP_SECOND_REG;
33278 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33279 return regclass;
33282 return NO_REGS;
33285 /* Generally when we see PLUS here, it's the function invariant
33286 (plus soft-fp const_int). Which can only be computed into general
33287 regs. */
33288 if (GET_CODE (x) == PLUS)
33289 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33291 /* QImode constants are easy to load, but non-constant QImode data
33292 must go into Q_REGS. */
33293 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33295 if (reg_class_subset_p (regclass, Q_REGS))
33296 return regclass;
33297 if (reg_class_subset_p (Q_REGS, regclass))
33298 return Q_REGS;
33299 return NO_REGS;
33302 return regclass;
33305 /* Discourage putting floating-point values in SSE registers unless
33306 SSE math is being used, and likewise for the 387 registers. */
33307 static reg_class_t
33308 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33310 enum machine_mode mode = GET_MODE (x);
33312 /* Restrict the output reload class to the register bank that we are doing
33313 math on. If we would like not to return a subset of CLASS, reject this
33314 alternative: if reload cannot do this, it will still use its choice. */
33315 mode = GET_MODE (x);
33316 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33317 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33319 if (X87_FLOAT_MODE_P (mode))
33321 if (regclass == FP_TOP_SSE_REGS)
33322 return FP_TOP_REG;
33323 else if (regclass == FP_SECOND_SSE_REGS)
33324 return FP_SECOND_REG;
33325 else
33326 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33329 return regclass;
33332 static reg_class_t
33333 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33334 enum machine_mode mode, secondary_reload_info *sri)
33336 /* Double-word spills from general registers to non-offsettable memory
33337 references (zero-extended addresses) require special handling. */
33338 if (TARGET_64BIT
33339 && MEM_P (x)
33340 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33341 && rclass == GENERAL_REGS
33342 && !offsettable_memref_p (x))
33344 sri->icode = (in_p
33345 ? CODE_FOR_reload_noff_load
33346 : CODE_FOR_reload_noff_store);
33347 /* Add the cost of moving address to a temporary. */
33348 sri->extra_cost = 1;
33350 return NO_REGS;
33353 /* QImode spills from non-QI registers require
33354 intermediate register on 32bit targets. */
33355 if (!TARGET_64BIT
33356 && !in_p && mode == QImode
33357 && (rclass == GENERAL_REGS
33358 || rclass == LEGACY_REGS
33359 || rclass == NON_Q_REGS
33360 || rclass == SIREG
33361 || rclass == DIREG
33362 || rclass == INDEX_REGS))
33364 int regno;
33366 if (REG_P (x))
33367 regno = REGNO (x);
33368 else
33369 regno = -1;
33371 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33372 regno = true_regnum (x);
33374 /* Return Q_REGS if the operand is in memory. */
33375 if (regno == -1)
33376 return Q_REGS;
33379 /* This condition handles the corner case where an expression involving
33380 pointers gets vectorized. We're trying to use the address of a
33381 stack slot as a vector initializer.
33383 (set (reg:V2DI 74 [ vect_cst_.2 ])
33384 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33386 Eventually frame gets turned into sp+offset like this:
33388 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33389 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33390 (const_int 392 [0x188]))))
33392 That later gets turned into:
33394 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33395 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33396 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33398 We'll have the following reload recorded:
33400 Reload 0: reload_in (DI) =
33401 (plus:DI (reg/f:DI 7 sp)
33402 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33403 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33404 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33405 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33406 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33407 reload_reg_rtx: (reg:V2DI 22 xmm1)
33409 Which isn't going to work since SSE instructions can't handle scalar
33410 additions. Returning GENERAL_REGS forces the addition into an integer
33411 register, and reload can handle subsequent reloads without problems. */
33413 if (in_p && GET_CODE (x) == PLUS
33414 && SSE_CLASS_P (rclass)
33415 && SCALAR_INT_MODE_P (mode))
33416 return GENERAL_REGS;
33418 return NO_REGS;
33421 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33423 static bool
33424 ix86_class_likely_spilled_p (reg_class_t rclass)
33426 switch (rclass)
33428 case AREG:
33429 case DREG:
33430 case CREG:
33431 case BREG:
33432 case AD_REGS:
33433 case SIREG:
33434 case DIREG:
33435 case SSE_FIRST_REG:
33436 case FP_TOP_REG:
33437 case FP_SECOND_REG:
33438 return true;
33440 default:
33441 break;
33444 return false;
33447 /* If we are copying between general and FP registers, we need a memory
33448 location. The same is true for SSE and MMX registers.
33450 To optimize register_move_cost performance, allow an inline variant.
33452 The macro can't work reliably when one of the CLASSES is a class containing
33453 registers from multiple units (SSE, MMX, integer). We avoid this by never
33454 combining those units in single alternative in the machine description.
33455 Ensure that this constraint holds to avoid unexpected surprises.
33457 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33458 enforce these sanity checks. */
33460 static inline bool
33461 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33462 enum machine_mode mode, int strict)
33464 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33465 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33466 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33467 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33468 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33469 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33471 gcc_assert (!strict || lra_in_progress);
33472 return true;
33475 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33476 return true;
33478 /* ??? This is a lie. We do have moves between mmx/general, and for
33479 mmx/sse2. But by saying we need secondary memory we discourage the
33480 register allocator from using the mmx registers unless needed. */
33481 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33482 return true;
33484 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33486 /* SSE1 doesn't have any direct moves from other classes. */
33487 if (!TARGET_SSE2)
33488 return true;
33490 /* If the target says that inter-unit moves are more expensive
33491 than moving through memory, then don't generate them. */
33492 if (!TARGET_INTER_UNIT_MOVES)
33493 return true;
33495 /* Between SSE and general, we have moves no larger than word size. */
33496 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33497 return true;
33500 return false;
33503 bool
33504 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33505 enum machine_mode mode, int strict)
33507 return inline_secondary_memory_needed (class1, class2, mode, strict);
33510 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33512 On the 80386, this is the size of MODE in words,
33513 except in the FP regs, where a single reg is always enough. */
33515 static unsigned char
33516 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33518 if (MAYBE_INTEGER_CLASS_P (rclass))
33520 if (mode == XFmode)
33521 return (TARGET_64BIT ? 2 : 3);
33522 else if (mode == XCmode)
33523 return (TARGET_64BIT ? 4 : 6);
33524 else
33525 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33527 else
33529 if (COMPLEX_MODE_P (mode))
33530 return 2;
33531 else
33532 return 1;
33536 /* Return true if the registers in CLASS cannot represent the change from
33537 modes FROM to TO. */
33539 bool
33540 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33541 enum reg_class regclass)
33543 if (from == to)
33544 return false;
33546 /* x87 registers can't do subreg at all, as all values are reformatted
33547 to extended precision. */
33548 if (MAYBE_FLOAT_CLASS_P (regclass))
33549 return true;
33551 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33553 /* Vector registers do not support QI or HImode loads. If we don't
33554 disallow a change to these modes, reload will assume it's ok to
33555 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33556 the vec_dupv4hi pattern. */
33557 if (GET_MODE_SIZE (from) < 4)
33558 return true;
33560 /* Vector registers do not support subreg with nonzero offsets, which
33561 are otherwise valid for integer registers. Since we can't see
33562 whether we have a nonzero offset from here, prohibit all
33563 nonparadoxical subregs changing size. */
33564 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33565 return true;
33568 return false;
33571 /* Return the cost of moving data of mode M between a
33572 register and memory. A value of 2 is the default; this cost is
33573 relative to those in `REGISTER_MOVE_COST'.
33575 This function is used extensively by register_move_cost, which is used to
33576 build tables at startup. Make it inline in this case.
33577 When IN is 2, return maximum of in and out move cost.
33579 If moving between registers and memory is more expensive than
33580 between two registers, you should define this macro to express the
33581 relative cost.
33583 Also model the increased moving costs of QImode registers in non
33584 Q_REGS classes. */
33586 static inline int
33587 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33588 int in)
33590 int cost;
33591 if (FLOAT_CLASS_P (regclass))
33593 int index;
33594 switch (mode)
33596 case SFmode:
33597 index = 0;
33598 break;
33599 case DFmode:
33600 index = 1;
33601 break;
33602 case XFmode:
33603 index = 2;
33604 break;
33605 default:
33606 return 100;
33608 if (in == 2)
33609 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33610 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33612 if (SSE_CLASS_P (regclass))
33614 int index;
33615 switch (GET_MODE_SIZE (mode))
33617 case 4:
33618 index = 0;
33619 break;
33620 case 8:
33621 index = 1;
33622 break;
33623 case 16:
33624 index = 2;
33625 break;
33626 default:
33627 return 100;
33629 if (in == 2)
33630 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33631 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33633 if (MMX_CLASS_P (regclass))
33635 int index;
33636 switch (GET_MODE_SIZE (mode))
33638 case 4:
33639 index = 0;
33640 break;
33641 case 8:
33642 index = 1;
33643 break;
33644 default:
33645 return 100;
33647 if (in == 2)
33648 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33649 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33651 switch (GET_MODE_SIZE (mode))
33653 case 1:
33654 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33656 if (!in)
33657 return ix86_cost->int_store[0];
33658 if (TARGET_PARTIAL_REG_DEPENDENCY
33659 && optimize_function_for_speed_p (cfun))
33660 cost = ix86_cost->movzbl_load;
33661 else
33662 cost = ix86_cost->int_load[0];
33663 if (in == 2)
33664 return MAX (cost, ix86_cost->int_store[0]);
33665 return cost;
33667 else
33669 if (in == 2)
33670 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33671 if (in)
33672 return ix86_cost->movzbl_load;
33673 else
33674 return ix86_cost->int_store[0] + 4;
33676 break;
33677 case 2:
33678 if (in == 2)
33679 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33680 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33681 default:
33682 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33683 if (mode == TFmode)
33684 mode = XFmode;
33685 if (in == 2)
33686 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33687 else if (in)
33688 cost = ix86_cost->int_load[2];
33689 else
33690 cost = ix86_cost->int_store[2];
33691 return (cost * (((int) GET_MODE_SIZE (mode)
33692 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33696 static int
33697 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33698 bool in)
33700 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33704 /* Return the cost of moving data from a register in class CLASS1 to
33705 one in class CLASS2.
33707 It is not required that the cost always equal 2 when FROM is the same as TO;
33708 on some machines it is expensive to move between registers if they are not
33709 general registers. */
33711 static int
33712 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33713 reg_class_t class2_i)
33715 enum reg_class class1 = (enum reg_class) class1_i;
33716 enum reg_class class2 = (enum reg_class) class2_i;
33718 /* In case we require secondary memory, compute cost of the store followed
33719 by load. To avoid bad register allocation choices, we need this to be
33720 *at least* as high as the symmetric MEMORY_MOVE_COST. */
33722 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33724 int cost = 1;
33726 cost += inline_memory_move_cost (mode, class1, 2);
33727 cost += inline_memory_move_cost (mode, class2, 2);
33729 /* In the case of copying from a general purpose register we may emit
33730 multiple stores followed by a single load, causing a memory size
33731 mismatch stall. Count this as an arbitrarily high cost of 20. */
33732 if (targetm.class_max_nregs (class1, mode)
33733 > targetm.class_max_nregs (class2, mode))
33734 cost += 20;
33736 /* In the case of FP/MMX moves, the registers actually overlap, and we
33737 have to switch modes in order to treat them differently. */
33738 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33739 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33740 cost += 20;
33742 return cost;
33745 /* Moves between SSE/MMX and integer unit are expensive. */
33746 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33747 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33749 /* ??? By keeping the returned value relatively high, we limit the number
33750 of moves between integer and MMX/SSE registers for all targets.
33751 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
33752 where integer modes in MMX/SSE registers are not tieable
33753 because of missing QImode and HImode moves to, from or between
33754 MMX/SSE registers. */
33755 return MAX (8, ix86_cost->mmxsse_to_integer);
33757 if (MAYBE_FLOAT_CLASS_P (class1))
33758 return ix86_cost->fp_move;
33759 if (MAYBE_SSE_CLASS_P (class1))
33760 return ix86_cost->sse_move;
33761 if (MAYBE_MMX_CLASS_P (class1))
33762 return ix86_cost->mmx_move;
33763 return 2;
33766 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33767 MODE. */
33769 bool
33770 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33772 /* Flags and only flags can only hold CCmode values. */
33773 if (CC_REGNO_P (regno))
33774 return GET_MODE_CLASS (mode) == MODE_CC;
33775 if (GET_MODE_CLASS (mode) == MODE_CC
33776 || GET_MODE_CLASS (mode) == MODE_RANDOM
33777 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33778 return false;
33779 if (STACK_REGNO_P (regno))
33780 return VALID_FP_MODE_P (mode);
33781 if (SSE_REGNO_P (regno))
33783 /* We implement the move patterns for all vector modes into and
33784 out of SSE registers, even when no operation instructions
33785 are available. OImode move is available only when AVX is
33786 enabled. */
33787 return ((TARGET_AVX && mode == OImode)
33788 || VALID_AVX256_REG_MODE (mode)
33789 || VALID_SSE_REG_MODE (mode)
33790 || VALID_SSE2_REG_MODE (mode)
33791 || VALID_MMX_REG_MODE (mode)
33792 || VALID_MMX_REG_MODE_3DNOW (mode));
33794 if (MMX_REGNO_P (regno))
33796 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33797 so if the register is available at all, then we can move data of
33798 the given mode into or out of it. */
33799 return (VALID_MMX_REG_MODE (mode)
33800 || VALID_MMX_REG_MODE_3DNOW (mode));
33803 if (mode == QImode)
33805 /* Take care for QImode values - they can be in non-QI regs,
33806 but then they do cause partial register stalls. */
33807 if (TARGET_64BIT || QI_REGNO_P (regno))
33808 return true;
33809 if (!TARGET_PARTIAL_REG_STALL)
33810 return true;
33811 return !can_create_pseudo_p ();
33813 /* We handle both integer and floats in the general purpose registers. */
33814 else if (VALID_INT_MODE_P (mode))
33815 return true;
33816 else if (VALID_FP_MODE_P (mode))
33817 return true;
33818 else if (VALID_DFP_MODE_P (mode))
33819 return true;
33820 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33821 on to use that value in smaller contexts, this can easily force a
33822 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33823 supporting DImode, allow it. */
33824 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33825 return true;
33827 return false;
33830 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33831 tieable integer mode. */
33833 static bool
33834 ix86_tieable_integer_mode_p (enum machine_mode mode)
33836 switch (mode)
33838 case HImode:
33839 case SImode:
33840 return true;
33842 case QImode:
33843 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33845 case DImode:
33846 return TARGET_64BIT;
33848 default:
33849 return false;
33853 /* Return true if MODE1 is accessible in a register that can hold MODE2
33854 without copying. That is, all register classes that can hold MODE2
33855 can also hold MODE1. */
33857 bool
33858 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33860 if (mode1 == mode2)
33861 return true;
33863 if (ix86_tieable_integer_mode_p (mode1)
33864 && ix86_tieable_integer_mode_p (mode2))
33865 return true;
33867 /* MODE2 being XFmode implies fp stack or general regs, which means we
33868 can tie any smaller floating point modes to it. Note that we do not
33869 tie this with TFmode. */
33870 if (mode2 == XFmode)
33871 return mode1 == SFmode || mode1 == DFmode;
33873 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33874 that we can tie it with SFmode. */
33875 if (mode2 == DFmode)
33876 return mode1 == SFmode;
33878 /* If MODE2 is only appropriate for an SSE register, then tie with
33879 any other mode acceptable to SSE registers. */
33880 if (GET_MODE_SIZE (mode2) == 32
33881 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33882 return (GET_MODE_SIZE (mode1) == 32
33883 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33884 if (GET_MODE_SIZE (mode2) == 16
33885 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33886 return (GET_MODE_SIZE (mode1) == 16
33887 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33889 /* If MODE2 is appropriate for an MMX register, then tie
33890 with any other mode acceptable to MMX registers. */
33891 if (GET_MODE_SIZE (mode2) == 8
33892 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33893 return (GET_MODE_SIZE (mode1) == 8
33894 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33896 return false;
33899 /* Return the cost of moving between two registers of mode MODE. */
33901 static int
33902 ix86_set_reg_reg_cost (enum machine_mode mode)
33904 unsigned int units = UNITS_PER_WORD;
33906 switch (GET_MODE_CLASS (mode))
33908 default:
33909 break;
33911 case MODE_CC:
33912 units = GET_MODE_SIZE (CCmode);
33913 break;
33915 case MODE_FLOAT:
33916 if ((TARGET_SSE && mode == TFmode)
33917 || (TARGET_80387 && mode == XFmode)
33918 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33919 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33920 units = GET_MODE_SIZE (mode);
33921 break;
33923 case MODE_COMPLEX_FLOAT:
33924 if ((TARGET_SSE && mode == TCmode)
33925 || (TARGET_80387 && mode == XCmode)
33926 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33927 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
33928 units = GET_MODE_SIZE (mode);
33929 break;
33931 case MODE_VECTOR_INT:
33932 case MODE_VECTOR_FLOAT:
33933 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33934 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33935 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33936 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
33937 units = GET_MODE_SIZE (mode);
33940 /* Return the cost of moving between two registers of mode MODE,
33941 assuming that the move will be in pieces of at most UNITS bytes. */
33942 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
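/* Illustrative costs implied by the above: DImode on a 32-bit target keeps
   units at UNITS_PER_WORD (4 bytes), giving COSTS_N_INSNS (2); V4SFmode
   with SSE enabled uses the full 16-byte mode size, giving
   COSTS_N_INSNS (1).  */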
33945 /* Compute a (partial) cost for rtx X. Return true if the complete
33946 cost has been computed, and false if subexpressions should be
33947 scanned. In either case, *TOTAL contains the cost result. */
33949 static bool
33950 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
33951 bool speed)
33953 enum rtx_code code = (enum rtx_code) code_i;
33954 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
33955 enum machine_mode mode = GET_MODE (x);
33956 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
33958 switch (code)
33960 case SET:
33961 if (register_operand (SET_DEST (x), VOIDmode)
33962 && reg_or_0_operand (SET_SRC (x), VOIDmode))
33964 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
33965 return true;
33967 return false;
33969 case CONST_INT:
33970 case CONST:
33971 case LABEL_REF:
33972 case SYMBOL_REF:
33973 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
33974 *total = 3;
33975 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
33976 *total = 2;
33977 else if (flag_pic && SYMBOLIC_CONST (x)
33978 && (!TARGET_64BIT
33979 || (GET_CODE (x) != LABEL_REF
33980 && (GET_CODE (x) != SYMBOL_REF
33981 || !SYMBOL_REF_LOCAL_P (x)))))
33982 *total = 1;
33983 else
33984 *total = 0;
33985 return true;
33987 case CONST_DOUBLE:
33988 if (mode == VOIDmode)
33990 *total = 0;
33991 return true;
33993 switch (standard_80387_constant_p (x))
33995 case 1: /* 0.0 */
33996 *total = 1;
33997 return true;
33998 default: /* Other constants */
33999 *total = 2;
34000 return true;
34001 case 0:
34002 case -1:
34003 break;
34005 if (SSE_FLOAT_MODE_P (mode))
34007 case CONST_VECTOR:
34008 switch (standard_sse_constant_p (x))
34010 case 0:
34011 break;
34012 case 1: /* 0: xor eliminates false dependency */
34013 *total = 0;
34014 return true;
34015 default: /* -1: cmp contains false dependency */
34016 *total = 1;
34017 return true;
34020 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34021 it'll probably end up. Add a penalty for size. */
34022 *total = (COSTS_N_INSNS (1)
34023 + (flag_pic != 0 && !TARGET_64BIT)
34024 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34025 return true;
34027 case ZERO_EXTEND:
34028 /* The zero extension is often completely free on x86_64, so make
34029 it as cheap as possible. */
34030 if (TARGET_64BIT && mode == DImode
34031 && GET_MODE (XEXP (x, 0)) == SImode)
34032 *total = 1;
34033 else if (TARGET_ZERO_EXTEND_WITH_AND)
34034 *total = cost->add;
34035 else
34036 *total = cost->movzx;
34037 return false;
34039 case SIGN_EXTEND:
34040 *total = cost->movsx;
34041 return false;
34043 case ASHIFT:
34044 if (SCALAR_INT_MODE_P (mode)
34045 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34046 && CONST_INT_P (XEXP (x, 1)))
34048 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34049 if (value == 1)
34051 *total = cost->add;
34052 return false;
34054 if ((value == 2 || value == 3)
34055 && cost->lea <= cost->shift_const)
34057 *total = cost->lea;
34058 return false;
34061 /* FALLTHRU */
34063 case ROTATE:
34064 case ASHIFTRT:
34065 case LSHIFTRT:
34066 case ROTATERT:
34067 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34069 /* ??? Should be SSE vector operation cost. */
34070 /* At least for published AMD latencies, this really is the same
34071 as the latency for a simple fpu operation like fabs. */
34072 /* V*QImode is emulated with 1-11 insns. */
34073 if (mode == V16QImode || mode == V32QImode)
34075 int count = 11;
34076 if (TARGET_XOP && mode == V16QImode)
34078 /* For XOP we use vpshab, which requires a broadcast of the
34079 value to the variable shift insn. For constants this
34080 means a V16Q const in mem; even when we can perform the
34081 shift with one insn set the cost to prefer paddb. */
34082 if (CONSTANT_P (XEXP (x, 1)))
34084 *total = (cost->fabs
34085 + rtx_cost (XEXP (x, 0), code, 0, speed)
34086 + (speed ? 2 : COSTS_N_BYTES (16)));
34087 return true;
34089 count = 3;
34091 else if (TARGET_SSSE3)
34092 count = 7;
34093 *total = cost->fabs * count;
34095 else
34096 *total = cost->fabs;
34098 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34100 if (CONST_INT_P (XEXP (x, 1)))
34102 if (INTVAL (XEXP (x, 1)) > 32)
34103 *total = cost->shift_const + COSTS_N_INSNS (2);
34104 else
34105 *total = cost->shift_const * 2;
34107 else
34109 if (GET_CODE (XEXP (x, 1)) == AND)
34110 *total = cost->shift_var * 2;
34111 else
34112 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34115 else
34117 if (CONST_INT_P (XEXP (x, 1)))
34118 *total = cost->shift_const;
34119 else
34120 *total = cost->shift_var;
34122 return false;
34124 case FMA:
34126 rtx sub;
34128 gcc_assert (FLOAT_MODE_P (mode));
34129 gcc_assert (TARGET_FMA || TARGET_FMA4);
34131 /* ??? SSE scalar/vector cost should be used here. */
34132 /* ??? Bald assumption that fma has the same cost as fmul. */
34133 *total = cost->fmul;
34134 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34136 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34137 sub = XEXP (x, 0);
34138 if (GET_CODE (sub) == NEG)
34139 sub = XEXP (sub, 0);
34140 *total += rtx_cost (sub, FMA, 0, speed);
34142 sub = XEXP (x, 2);
34143 if (GET_CODE (sub) == NEG)
34144 sub = XEXP (sub, 0);
34145 *total += rtx_cost (sub, FMA, 2, speed);
34146 return true;
34149 case MULT:
34150 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34152 /* ??? SSE scalar cost should be used here. */
34153 *total = cost->fmul;
34154 return false;
34156 else if (X87_FLOAT_MODE_P (mode))
34158 *total = cost->fmul;
34159 return false;
34161 else if (FLOAT_MODE_P (mode))
34163 /* ??? SSE vector cost should be used here. */
34164 *total = cost->fmul;
34165 return false;
34167 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34169 /* V*QImode is emulated with 7-13 insns. */
34170 if (mode == V16QImode || mode == V32QImode)
34172 int extra = 11;
34173 if (TARGET_XOP && mode == V16QImode)
34174 extra = 5;
34175 else if (TARGET_SSSE3)
34176 extra = 6;
34177 *total = cost->fmul * 2 + cost->fabs * extra;
34179 /* V*DImode is emulated with 5-8 insns. */
34180 else if (mode == V2DImode || mode == V4DImode)
34182 if (TARGET_XOP && mode == V2DImode)
34183 *total = cost->fmul * 2 + cost->fabs * 3;
34184 else
34185 *total = cost->fmul * 3 + cost->fabs * 5;
34187 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34188 insns, including two PMULUDQ. */
34189 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34190 *total = cost->fmul * 2 + cost->fabs * 5;
34191 else
34192 *total = cost->fmul;
34193 return false;
34195 else
34197 rtx op0 = XEXP (x, 0);
34198 rtx op1 = XEXP (x, 1);
34199 int nbits;
34200 if (CONST_INT_P (XEXP (x, 1)))
34202 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34203 for (nbits = 0; value != 0; value &= value - 1)
34204 nbits++;
34206 else
34207 /* This is arbitrary. */
34208 nbits = 7;
34210 /* Compute costs correctly for widening multiplication. */
34211 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34212 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34213 == GET_MODE_SIZE (mode))
34215 int is_mulwiden = 0;
34216 enum machine_mode inner_mode = GET_MODE (op0);
34218 if (GET_CODE (op0) == GET_CODE (op1))
34219 is_mulwiden = 1, op1 = XEXP (op1, 0);
34220 else if (CONST_INT_P (op1))
34222 if (GET_CODE (op0) == SIGN_EXTEND)
34223 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34224 == INTVAL (op1);
34225 else
34226 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34229 if (is_mulwiden)
34230 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34233 *total = (cost->mult_init[MODE_INDEX (mode)]
34234 + nbits * cost->mult_bit
34235 + rtx_cost (op0, outer_code, opno, speed)
34236 + rtx_cost (op1, outer_code, opno, speed));
34238 return true;
34241 case DIV:
34242 case UDIV:
34243 case MOD:
34244 case UMOD:
34245 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34246 /* ??? SSE cost should be used here. */
34247 *total = cost->fdiv;
34248 else if (X87_FLOAT_MODE_P (mode))
34249 *total = cost->fdiv;
34250 else if (FLOAT_MODE_P (mode))
34251 /* ??? SSE vector cost should be used here. */
34252 *total = cost->fdiv;
34253 else
34254 *total = cost->divide[MODE_INDEX (mode)];
34255 return false;
34257 case PLUS:
34258 if (GET_MODE_CLASS (mode) == MODE_INT
34259 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34261 if (GET_CODE (XEXP (x, 0)) == PLUS
34262 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34263 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34264 && CONSTANT_P (XEXP (x, 1)))
34266 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34267 if (val == 2 || val == 4 || val == 8)
34269 *total = cost->lea;
34270 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34271 outer_code, opno, speed);
34272 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34273 outer_code, opno, speed);
34274 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34275 return true;
34278 else if (GET_CODE (XEXP (x, 0)) == MULT
34279 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34281 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34282 if (val == 2 || val == 4 || val == 8)
34284 *total = cost->lea;
34285 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34286 outer_code, opno, speed);
34287 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34288 return true;
34291 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34293 *total = cost->lea;
34294 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34295 outer_code, opno, speed);
34296 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34297 outer_code, opno, speed);
34298 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34299 return true;
34302 /* FALLTHRU */
34304 case MINUS:
34305 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34307 /* ??? SSE cost should be used here. */
34308 *total = cost->fadd;
34309 return false;
34311 else if (X87_FLOAT_MODE_P (mode))
34313 *total = cost->fadd;
34314 return false;
34316 else if (FLOAT_MODE_P (mode))
34318 /* ??? SSE vector cost should be used here. */
34319 *total = cost->fadd;
34320 return false;
34322 /* FALLTHRU */
34324 case AND:
34325 case IOR:
34326 case XOR:
34327 if (GET_MODE_CLASS (mode) == MODE_INT
34328 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34330 *total = (cost->add * 2
34331 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34332 << (GET_MODE (XEXP (x, 0)) != DImode))
34333 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34334 << (GET_MODE (XEXP (x, 1)) != DImode)));
34335 return true;
34337 /* FALLTHRU */
34339 case NEG:
34340 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34342 /* ??? SSE cost should be used here. */
34343 *total = cost->fchs;
34344 return false;
34346 else if (X87_FLOAT_MODE_P (mode))
34348 *total = cost->fchs;
34349 return false;
34351 else if (FLOAT_MODE_P (mode))
34353 /* ??? SSE vector cost should be used here. */
34354 *total = cost->fchs;
34355 return false;
34357 /* FALLTHRU */
34359 case NOT:
34360 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34362 /* ??? Should be SSE vector operation cost. */
34363 /* At least for published AMD latencies, this really is the same
34364 as the latency for a simple fpu operation like fabs. */
34365 *total = cost->fabs;
34367 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34368 *total = cost->add * 2;
34369 else
34370 *total = cost->add;
34371 return false;
34373 case COMPARE:
34374 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34375 && XEXP (XEXP (x, 0), 1) == const1_rtx
34376 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34377 && XEXP (x, 1) == const0_rtx)
34379 /* This kind of construct is implemented using test[bwl].
34380 Treat it as if we had an AND. */
34381 *total = (cost->add
34382 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34383 + rtx_cost (const1_rtx, outer_code, opno, speed));
34384 return true;
34386 return false;
34388 case FLOAT_EXTEND:
34389 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34390 *total = 0;
34391 return false;
34393 case ABS:
34394 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34395 /* ??? SSE cost should be used here. */
34396 *total = cost->fabs;
34397 else if (X87_FLOAT_MODE_P (mode))
34398 *total = cost->fabs;
34399 else if (FLOAT_MODE_P (mode))
34400 /* ??? SSE vector cost should be used here. */
34401 *total = cost->fabs;
34402 return false;
34404 case SQRT:
34405 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34406 /* ??? SSE cost should be used here. */
34407 *total = cost->fsqrt;
34408 else if (X87_FLOAT_MODE_P (mode))
34409 *total = cost->fsqrt;
34410 else if (FLOAT_MODE_P (mode))
34411 /* ??? SSE vector cost should be used here. */
34412 *total = cost->fsqrt;
34413 return false;
34415 case UNSPEC:
34416 if (XINT (x, 1) == UNSPEC_TP)
34417 *total = 0;
34418 return false;
34420 case VEC_SELECT:
34421 case VEC_CONCAT:
34422 case VEC_MERGE:
34423 case VEC_DUPLICATE:
34424 /* ??? Assume all of these vector manipulation patterns are
34425 recognizable, in which case they all pretty much have the
34426 same cost. */
34427 *total = cost->fabs;
34428 return true;
34430 default:
34431 return false;
34435 #if TARGET_MACHO
34437 static int current_machopic_label_num;
34439 /* Given a symbol name and its associated stub, write out the
34440 definition of the stub. */
34442 void
34443 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34445 unsigned int length;
34446 char *binder_name, *symbol_name, lazy_ptr_name[32];
34447 int label = ++current_machopic_label_num;
34449 /* For 64-bit we shouldn't get here. */
34450 gcc_assert (!TARGET_64BIT);
34452 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34453 symb = targetm.strip_name_encoding (symb);
34455 length = strlen (stub);
34456 binder_name = XALLOCAVEC (char, length + 32);
34457 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34459 length = strlen (symb);
34460 symbol_name = XALLOCAVEC (char, length + 32);
34461 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34463 sprintf (lazy_ptr_name, "L%d$lz", label);
34465 if (MACHOPIC_ATT_STUB)
34466 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34467 else if (MACHOPIC_PURE)
34468 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34469 else
34470 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34472 fprintf (file, "%s:\n", stub);
34473 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34475 if (MACHOPIC_ATT_STUB)
34477 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34479 else if (MACHOPIC_PURE)
34481 /* PIC stub. */
34482 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34483 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34484 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34485 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34486 label, lazy_ptr_name, label);
34487 fprintf (file, "\tjmp\t*%%ecx\n");
34489 else
34490 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34492 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34493 it needs no stub-binding-helper. */
34494 if (MACHOPIC_ATT_STUB)
34495 return;
34497 fprintf (file, "%s:\n", binder_name);
34499 if (MACHOPIC_PURE)
34501 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34502 fprintf (file, "\tpushl\t%%ecx\n");
34504 else
34505 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34507 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34509 /* N.B. Keep the correspondence of these
34510 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34511 old-pic/new-pic/non-pic stubs; altering this will break
34512 compatibility with existing dylibs. */
34513 if (MACHOPIC_PURE)
34515 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34516 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34518 else
34519 /* 16-byte -mdynamic-no-pic stub. */
34520 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34522 fprintf (file, "%s:\n", lazy_ptr_name);
34523 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34524 fprintf (file, ASM_LONG "%s\n", binder_name);
34526 #endif /* TARGET_MACHO */
34528 /* Order the registers for register allocator. */
34530 void
34531 x86_order_regs_for_local_alloc (void)
34533 int pos = 0;
34534 int i;
34536 /* First allocate the local general purpose registers. */
34537 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34538 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34539 reg_alloc_order [pos++] = i;
34541 /* Global general purpose registers. */
34542 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34543 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34544 reg_alloc_order [pos++] = i;
34546 /* x87 registers come first in case we are doing FP math
34547 using them. */
34548 if (!TARGET_SSE_MATH)
34549 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34550 reg_alloc_order [pos++] = i;
34552 /* SSE registers. */
34553 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34554 reg_alloc_order [pos++] = i;
34555 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34556 reg_alloc_order [pos++] = i;
34558 /* x87 registers. */
34559 if (TARGET_SSE_MATH)
34560 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34561 reg_alloc_order [pos++] = i;
34563 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34564 reg_alloc_order [pos++] = i;
34566 /* Initialize the rest of the array, as we do not allocate some registers
34567 at all. */
34568 while (pos < FIRST_PSEUDO_REGISTER)
34569 reg_alloc_order [pos++] = 0;
34572 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34573 in struct attribute_spec.handler. */
34574 static tree
34575 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34576 tree args,
34577 int flags ATTRIBUTE_UNUSED,
34578 bool *no_add_attrs)
34580 if (TREE_CODE (*node) != FUNCTION_TYPE
34581 && TREE_CODE (*node) != METHOD_TYPE
34582 && TREE_CODE (*node) != FIELD_DECL
34583 && TREE_CODE (*node) != TYPE_DECL)
34585 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34586 name);
34587 *no_add_attrs = true;
34588 return NULL_TREE;
34590 if (TARGET_64BIT)
34592 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34593 name);
34594 *no_add_attrs = true;
34595 return NULL_TREE;
34597 if (is_attribute_p ("callee_pop_aggregate_return", name))
34599 tree cst;
34601 cst = TREE_VALUE (args);
34602 if (TREE_CODE (cst) != INTEGER_CST)
34604 warning (OPT_Wattributes,
34605 "%qE attribute requires an integer constant argument",
34606 name);
34607 *no_add_attrs = true;
34609 else if (compare_tree_int (cst, 0) != 0
34610 && compare_tree_int (cst, 1) != 0)
34612 warning (OPT_Wattributes,
34613 "argument to %qE attribute is neither zero, nor one",
34614 name);
34615 *no_add_attrs = true;
34618 return NULL_TREE;
34621 return NULL_TREE;
34624 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34625 struct attribute_spec.handler. */
34626 static tree
34627 ix86_handle_abi_attribute (tree *node, tree name,
34628 tree args ATTRIBUTE_UNUSED,
34629 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34631 if (TREE_CODE (*node) != FUNCTION_TYPE
34632 && TREE_CODE (*node) != METHOD_TYPE
34633 && TREE_CODE (*node) != FIELD_DECL
34634 && TREE_CODE (*node) != TYPE_DECL)
34636 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34637 name);
34638 *no_add_attrs = true;
34639 return NULL_TREE;
34642 /* Can combine regparm with all attributes but fastcall. */
34643 if (is_attribute_p ("ms_abi", name))
34645 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34647 error ("ms_abi and sysv_abi attributes are not compatible");
34650 return NULL_TREE;
34652 else if (is_attribute_p ("sysv_abi", name))
34654 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34656 error ("ms_abi and sysv_abi attributes are not compatible");
34659 return NULL_TREE;
34662 return NULL_TREE;
34665 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34666 struct attribute_spec.handler. */
34667 static tree
34668 ix86_handle_struct_attribute (tree *node, tree name,
34669 tree args ATTRIBUTE_UNUSED,
34670 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34672 tree *type = NULL;
34673 if (DECL_P (*node))
34675 if (TREE_CODE (*node) == TYPE_DECL)
34676 type = &TREE_TYPE (*node);
34678 else
34679 type = node;
34681 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34683 warning (OPT_Wattributes, "%qE attribute ignored",
34684 name);
34685 *no_add_attrs = true;
34688 else if ((is_attribute_p ("ms_struct", name)
34689 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34690 || ((is_attribute_p ("gcc_struct", name)
34691 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34693 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34694 name);
34695 *no_add_attrs = true;
34698 return NULL_TREE;
34701 static tree
34702 ix86_handle_fndecl_attribute (tree *node, tree name,
34703 tree args ATTRIBUTE_UNUSED,
34704 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34706 if (TREE_CODE (*node) != FUNCTION_DECL)
34708 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34709 name);
34710 *no_add_attrs = true;
34712 return NULL_TREE;
34715 static bool
34716 ix86_ms_bitfield_layout_p (const_tree record_type)
34718 return ((TARGET_MS_BITFIELD_LAYOUT
34719 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34720 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34723 /* Returns an expression indicating where the this parameter is
34724 located on entry to the FUNCTION. */
34726 static rtx
34727 x86_this_parameter (tree function)
34729 tree type = TREE_TYPE (function);
34730 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34731 int nregs;
34733 if (TARGET_64BIT)
34735 const int *parm_regs;
34737 if (ix86_function_type_abi (type) == MS_ABI)
34738 parm_regs = x86_64_ms_abi_int_parameter_registers;
34739 else
34740 parm_regs = x86_64_int_parameter_registers;
34741 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34744 nregs = ix86_function_regparm (type, function);
34746 if (nregs > 0 && !stdarg_p (type))
34748 int regno;
34749 unsigned int ccvt = ix86_get_callcvt (type);
34751 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34752 regno = aggr ? DX_REG : CX_REG;
34753 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34755 regno = CX_REG;
34756 if (aggr)
34757 return gen_rtx_MEM (SImode,
34758 plus_constant (Pmode, stack_pointer_rtx, 4));
34760 else
34762 regno = AX_REG;
34763 if (aggr)
34765 regno = DX_REG;
34766 if (nregs == 1)
34767 return gen_rtx_MEM (SImode,
34768 plus_constant (Pmode,
34769 stack_pointer_rtx, 4));
34772 return gen_rtx_REG (SImode, regno);
34775 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34776 aggr ? 8 : 4));
34779 /* Determine whether x86_output_mi_thunk can succeed. */
34781 static bool
34782 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34783 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34784 HOST_WIDE_INT vcall_offset, const_tree function)
34786 /* 64-bit can handle anything. */
34787 if (TARGET_64BIT)
34788 return true;
34790 /* For 32-bit, everything's fine if we have one free register. */
34791 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34792 return true;
34794 /* Need a free register for vcall_offset. */
34795 if (vcall_offset)
34796 return false;
34798 /* Need a free register for GOT references. */
34799 if (flag_pic && !targetm.binds_local_p (function))
34800 return false;
34802 /* Otherwise ok. */
34803 return true;
34806 /* Output the assembler code for a thunk function. THUNK_DECL is the
34807 declaration for the thunk function itself, FUNCTION is the decl for
34808 the target function. DELTA is an immediate constant offset to be
34809 added to THIS. If VCALL_OFFSET is nonzero, the word at
34810 *(*this + vcall_offset) should be added to THIS. */
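/* Roughly, and ignoring ABI details, the emitted thunk behaves like the
following pseudo-C (names here are illustrative only, not actual helpers):

this = (char *) this + DELTA;
if (VCALL_OFFSET)
this = (char *) this + *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
return FUNCTION (this, ...);   /* emitted as a sibling call / jump */

i.e. a constant adjustment, an optional vtable-based adjustment, and a
tail call to the target function.  */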
34812 static void
34813 x86_output_mi_thunk (FILE *file,
34814 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34815 HOST_WIDE_INT vcall_offset, tree function)
34817 rtx this_param = x86_this_parameter (function);
34818 rtx this_reg, tmp, fnaddr;
34819 unsigned int tmp_regno;
34821 if (TARGET_64BIT)
34822 tmp_regno = R10_REG;
34823 else
34825 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34826 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34827 tmp_regno = AX_REG;
34828 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34829 tmp_regno = DX_REG;
34830 else
34831 tmp_regno = CX_REG;
34834 emit_note (NOTE_INSN_PROLOGUE_END);
34836 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34837 pull it in now and let DELTA benefit. */
34838 if (REG_P (this_param))
34839 this_reg = this_param;
34840 else if (vcall_offset)
34842 /* Put the this parameter into %eax. */
34843 this_reg = gen_rtx_REG (Pmode, AX_REG);
34844 emit_move_insn (this_reg, this_param);
34846 else
34847 this_reg = NULL_RTX;
34849 /* Adjust the this parameter by a fixed constant. */
34850 if (delta)
34852 rtx delta_rtx = GEN_INT (delta);
34853 rtx delta_dst = this_reg ? this_reg : this_param;
34855 if (TARGET_64BIT)
34857 if (!x86_64_general_operand (delta_rtx, Pmode))
34859 tmp = gen_rtx_REG (Pmode, tmp_regno);
34860 emit_move_insn (tmp, delta_rtx);
34861 delta_rtx = tmp;
34865 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34868 /* Adjust the this parameter by a value stored in the vtable. */
34869 if (vcall_offset)
34871 rtx vcall_addr, vcall_mem, this_mem;
34873 tmp = gen_rtx_REG (Pmode, tmp_regno);
34875 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
34876 if (Pmode != ptr_mode)
34877 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34878 emit_move_insn (tmp, this_mem);
34880 /* Adjust the this parameter. */
34881 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34882 if (TARGET_64BIT
34883 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34885 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34886 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34887 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34890 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34891 if (Pmode != ptr_mode)
34892 emit_insn (gen_addsi_1_zext (this_reg,
34893 gen_rtx_REG (ptr_mode,
34894 REGNO (this_reg)),
34895 vcall_mem));
34896 else
34897 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34900 /* If necessary, drop THIS back to its stack slot. */
34901 if (this_reg && this_reg != this_param)
34902 emit_move_insn (this_param, this_reg);
34904 fnaddr = XEXP (DECL_RTL (function), 0);
34905 if (TARGET_64BIT)
34907 if (!flag_pic || targetm.binds_local_p (function)
34908 || cfun->machine->call_abi == MS_ABI)
34910 else
34912 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34913 tmp = gen_rtx_CONST (Pmode, tmp);
34914 fnaddr = gen_rtx_MEM (Pmode, tmp);
34917 else
34919 if (!flag_pic || targetm.binds_local_p (function))
34921 #if TARGET_MACHO
34922 else if (TARGET_MACHO)
34924 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34925 fnaddr = XEXP (fnaddr, 0);
34927 #endif /* TARGET_MACHO */
34928 else
34930 tmp = gen_rtx_REG (Pmode, CX_REG);
34931 output_set_got (tmp, NULL_RTX);
34933 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
34934 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
34935 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
34939 /* Our sibling call patterns do not allow memories, because we have no
34940 predicate that can distinguish between frame and non-frame memory.
34941 For our purposes here, we can get away with (ab)using a jump pattern,
34942 because we're going to do no optimization. */
34943 if (MEM_P (fnaddr))
34944 emit_jump_insn (gen_indirect_jump (fnaddr));
34945 else
34947 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
34948 fnaddr = legitimize_pic_address (fnaddr,
34949 gen_rtx_REG (Pmode, tmp_regno));
34951 if (!sibcall_insn_operand (fnaddr, word_mode))
34953 tmp = gen_rtx_REG (word_mode, tmp_regno);
34954 if (GET_MODE (fnaddr) != word_mode)
34955 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
34956 emit_move_insn (tmp, fnaddr);
34957 fnaddr = tmp;
34960 tmp = gen_rtx_MEM (QImode, fnaddr);
34961 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
34962 tmp = emit_call_insn (tmp);
34963 SIBLING_CALL_P (tmp) = 1;
34965 emit_barrier ();
34967 /* Emit just enough of rest_of_compilation to get the insns emitted.
34968 Note that use_thunk calls assemble_start_function et al. */
34969 tmp = get_insns ();
34970 shorten_branches (tmp);
34971 final_start_function (tmp, file, 1);
34972 final (tmp, file, 1);
34973 final_end_function ();
34976 static void
34977 x86_file_start (void)
34979 default_file_start ();
34980 #if TARGET_MACHO
34981 darwin_file_start ();
34982 #endif
34983 if (X86_FILE_START_VERSION_DIRECTIVE)
34984 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
34985 if (X86_FILE_START_FLTUSED)
34986 fputs ("\t.global\t__fltused\n", asm_out_file);
34987 if (ix86_asm_dialect == ASM_INTEL)
34988 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
34991 int
34992 x86_field_alignment (tree field, int computed)
34994 enum machine_mode mode;
34995 tree type = TREE_TYPE (field);
34997 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
34998 return computed;
34999 mode = TYPE_MODE (strip_array_types (type));
35000 if (mode == DFmode || mode == DCmode
35001 || GET_MODE_CLASS (mode) == MODE_INT
35002 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35003 return MIN (32, computed);
35004 return computed;
35007 /* Output assembler code to FILE to increment profiler label # LABELNO
35008 for profiling a function entry. */
35009 void
35010 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35012 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35013 : MCOUNT_NAME);
35015 if (TARGET_64BIT)
35017 #ifndef NO_PROFILE_COUNTERS
35018 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35019 #endif
35021 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35022 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35023 else
35024 fprintf (file, "\tcall\t%s\n", mcount_name);
35026 else if (flag_pic)
35028 #ifndef NO_PROFILE_COUNTERS
35029 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35030 LPREFIX, labelno);
35031 #endif
35032 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35034 else
35036 #ifndef NO_PROFILE_COUNTERS
35037 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35038 LPREFIX, labelno);
35039 #endif
35040 fprintf (file, "\tcall\t%s\n", mcount_name);
35044 /* We don't have exact information about the insn sizes, but we may assume
35045 quite safely that we are informed about all 1-byte insns and memory
35046 address sizes. This is enough to eliminate unnecessary padding in
35047 99% of cases. */
35049 static int
35050 min_insn_size (rtx insn)
35052 int l = 0, len;
35054 if (!INSN_P (insn) || !active_insn_p (insn))
35055 return 0;
35057 /* Discard alignments we've emitted and jump instructions. */
35058 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35059 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35060 return 0;
35061 if (JUMP_TABLE_DATA_P (insn))
35062 return 0;
35064 /* Important case - calls are always 5 bytes.
35065 It is common to have many calls in a row. */
35066 if (CALL_P (insn)
35067 && symbolic_reference_mentioned_p (PATTERN (insn))
35068 && !SIBLING_CALL_P (insn))
35069 return 5;
35070 len = get_attr_length (insn);
35071 if (len <= 1)
35072 return 1;
35074 /* For normal instructions we rely on get_attr_length being exact,
35075 with a few exceptions. */
35076 if (!JUMP_P (insn))
35078 enum attr_type type = get_attr_type (insn);
35080 switch (type)
35082 case TYPE_MULTI:
35083 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35084 || asm_noperands (PATTERN (insn)) >= 0)
35085 return 0;
35086 break;
35087 case TYPE_OTHER:
35088 case TYPE_FCMP:
35089 break;
35090 default:
35091 /* Otherwise trust get_attr_length. */
35092 return len;
35095 l = get_attr_length_address (insn);
35096 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35097 l = 4;
35099 if (l)
35100 return 1+l;
35101 else
35102 return 2;
35105 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35107 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a
35108 16-byte window. */
35110 static void
35111 ix86_avoid_jump_mispredicts (void)
35113 rtx insn, start = get_insns ();
35114 int nbytes = 0, njumps = 0;
35115 int isjump = 0;
35117 /* Look for all minimal intervals of instructions containing 4 jumps.
35118 The intervals are bounded by START and INSN.  NBYTES is the total
35119 size of instructions in the interval including INSN and not including
35120 START.  When NBYTES is smaller than 16 bytes, it is possible
35121 that the ends of START and INSN land in the same 16-byte page.
35123 The smallest offset in the page at which INSN can start is the case where
35124 START ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
35125 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).  */
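/* A small illustration of the padding arithmetic below (hypothetical
numbers): if NBYTES == 12 after adding a 2-byte INSN that would be the
fourth jump of the window, the emitted pad is 15 - 12 + 2 = 5 bytes,
which pushes INSN out of the current 16-byte window.  */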
35127 for (insn = start; insn; insn = NEXT_INSN (insn))
35129 int min_size;
35131 if (LABEL_P (insn))
35133 int align = label_to_alignment (insn);
35134 int max_skip = label_to_max_skip (insn);
35136 if (max_skip > 15)
35137 max_skip = 15;
35138 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35139 already in the current 16 byte page, because otherwise
35140 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35141 bytes to reach 16 byte boundary. */
35142 if (align <= 0
35143 || (align <= 3 && max_skip != (1 << align) - 1))
35144 max_skip = 0;
35145 if (dump_file)
35146 fprintf (dump_file, "Label %i with max_skip %i\n",
35147 INSN_UID (insn), max_skip);
35148 if (max_skip)
35150 while (nbytes + max_skip >= 16)
35152 start = NEXT_INSN (start);
35153 if ((JUMP_P (start)
35154 && GET_CODE (PATTERN (start)) != ADDR_VEC
35155 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35156 || CALL_P (start))
35157 njumps--, isjump = 1;
35158 else
35159 isjump = 0;
35160 nbytes -= min_insn_size (start);
35163 continue;
35166 min_size = min_insn_size (insn);
35167 nbytes += min_size;
35168 if (dump_file)
35169 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35170 INSN_UID (insn), min_size);
35171 if ((JUMP_P (insn)
35172 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35173 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35174 || CALL_P (insn))
35175 njumps++;
35176 else
35177 continue;
35179 while (njumps > 3)
35181 start = NEXT_INSN (start);
35182 if ((JUMP_P (start)
35183 && GET_CODE (PATTERN (start)) != ADDR_VEC
35184 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35185 || CALL_P (start))
35186 njumps--, isjump = 1;
35187 else
35188 isjump = 0;
35189 nbytes -= min_insn_size (start);
35191 gcc_assert (njumps >= 0);
35192 if (dump_file)
35193 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35194 INSN_UID (start), INSN_UID (insn), nbytes);
35196 if (njumps == 3 && isjump && nbytes < 16)
35198 int padsize = 15 - nbytes + min_insn_size (insn);
35200 if (dump_file)
35201 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35202 INSN_UID (insn), padsize);
35203 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35207 #endif
35209 /* AMD Athlon works faster
35210 when RET is neither the destination of a conditional jump nor directly preceded
35211 by another jump instruction.  We avoid the penalty by inserting a NOP just
35212 before the RET instruction in such cases. */
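/* Note that the "NOP" is realized below by rewriting the plain return
into gen_simple_return_internal_long, i.e. the longer encoding of the
return (presumably the rep-prefixed ret), rather than by emitting a
literal NOP instruction.  */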
35213 static void
35214 ix86_pad_returns (void)
35216 edge e;
35217 edge_iterator ei;
35219 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35221 basic_block bb = e->src;
35222 rtx ret = BB_END (bb);
35223 rtx prev;
35224 bool replace = false;
35226 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35227 || optimize_bb_for_size_p (bb))
35228 continue;
35229 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35230 if (active_insn_p (prev) || LABEL_P (prev))
35231 break;
35232 if (prev && LABEL_P (prev))
35234 edge e;
35235 edge_iterator ei;
35237 FOR_EACH_EDGE (e, ei, bb->preds)
35238 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35239 && !(e->flags & EDGE_FALLTHRU))
35240 replace = true;
35242 if (!replace)
35244 prev = prev_active_insn (ret);
35245 if (prev
35246 && ((JUMP_P (prev) && any_condjump_p (prev))
35247 || CALL_P (prev)))
35248 replace = true;
35249 /* Empty functions get a branch mispredict even when
35250 the jump destination is not visible to us. */
35251 if (!prev && !optimize_function_for_size_p (cfun))
35252 replace = true;
35254 if (replace)
35256 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35257 delete_insn (ret);
35262 /* Count the minimum number of instructions in BB. Return 4 if the
35263 number of instructions >= 4. */
35265 static int
35266 ix86_count_insn_bb (basic_block bb)
35268 rtx insn;
35269 int insn_count = 0;
35271 /* Count number of instructions in this block. Return 4 if the number
35272 of instructions >= 4. */
35273 FOR_BB_INSNS (bb, insn)
35275 /* This only happens in exit blocks. */
35276 if (JUMP_P (insn)
35277 && ANY_RETURN_P (PATTERN (insn)))
35278 break;
35280 if (NONDEBUG_INSN_P (insn)
35281 && GET_CODE (PATTERN (insn)) != USE
35282 && GET_CODE (PATTERN (insn)) != CLOBBER)
35284 insn_count++;
35285 if (insn_count >= 4)
35286 return insn_count;
35290 return insn_count;
35294 /* Count the minimum number of instructions in a code path through BB.
35295 Return 4 if the number of instructions >= 4. */
35297 static int
35298 ix86_count_insn (basic_block bb)
35300 edge e;
35301 edge_iterator ei;
35302 int min_prev_count;
35304 /* Only bother counting instructions along paths with no
35305 more than 2 basic blocks between entry and exit. Given
35306 that BB has an edge to exit, determine if a predecessor
35307 of BB has an edge from entry. If so, compute the number
35308 of instructions in the predecessor block. If there
35309 happen to be multiple such blocks, compute the minimum. */
35310 min_prev_count = 4;
35311 FOR_EACH_EDGE (e, ei, bb->preds)
35313 edge prev_e;
35314 edge_iterator prev_ei;
35316 if (e->src == ENTRY_BLOCK_PTR)
35318 min_prev_count = 0;
35319 break;
35321 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35323 if (prev_e->src == ENTRY_BLOCK_PTR)
35325 int count = ix86_count_insn_bb (e->src);
35326 if (count < min_prev_count)
35327 min_prev_count = count;
35328 break;
35333 if (min_prev_count < 4)
35334 min_prev_count += ix86_count_insn_bb (bb);
35336 return min_prev_count;
35339 /* Pad short function to 4 instructions. */
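/* This is driven by TARGET_PAD_SHORT_FUNCTION below; the intent (an
assumption, inferred from the tuning flag rather than stated here) is to
keep at least four instructions between a call and its return for
processors whose return-address predictor benefits from it, e.g. Atom.  */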
35341 static void
35342 ix86_pad_short_function (void)
35344 edge e;
35345 edge_iterator ei;
35347 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35349 rtx ret = BB_END (e->src);
35350 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35352 int insn_count = ix86_count_insn (e->src);
35354 /* Pad short function. */
35355 if (insn_count < 4)
35357 rtx insn = ret;
35359 /* Find epilogue. */
35360 while (insn
35361 && (!NOTE_P (insn)
35362 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35363 insn = PREV_INSN (insn);
35365 if (!insn)
35366 insn = ret;
35368 /* Two NOPs count as one instruction. */
35369 insn_count = 2 * (4 - insn_count);
35370 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35376 /* Implement machine-specific optimizations. We implement padding of returns
35377 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
35378 static void
35379 ix86_reorg (void)
35381 /* We are freeing block_for_insn in the toplev to keep compatibility
35382 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35383 compute_bb_for_insn ();
35385 if (optimize && optimize_function_for_speed_p (cfun))
35387 if (TARGET_PAD_SHORT_FUNCTION)
35388 ix86_pad_short_function ();
35389 else if (TARGET_PAD_RETURNS)
35390 ix86_pad_returns ();
35391 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35392 if (TARGET_FOUR_JUMP_LIMIT)
35393 ix86_avoid_jump_mispredicts ();
35394 #endif
35398 /* Return nonzero when a QImode register that must be represented via a REX
35399 prefix is used. */
35400 bool
35401 x86_extended_QIreg_mentioned_p (rtx insn)
35403 int i;
35404 extract_insn_cached (insn);
35405 for (i = 0; i < recog_data.n_operands; i++)
35406 if (GENERAL_REG_P (recog_data.operand[i])
35407 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35408 return true;
35409 return false;
35412 /* Return nonzero when P points to a register encoded via a REX prefix.
35413 Called via for_each_rtx. */
35414 static int
35415 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35417 unsigned int regno;
35418 if (!REG_P (*p))
35419 return 0;
35420 regno = REGNO (*p);
35421 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35424 /* Return true when INSN mentions a register that must be encoded using a
35425 REX prefix. */
35426 bool
35427 x86_extended_reg_mentioned_p (rtx insn)
35429 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35430 extended_reg_mentioned_1, NULL);
35433 /* If profitable, negate (without causing overflow) integer constant
35434 of mode MODE at location LOC. Return true in this case. */
35435 bool
35436 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35438 HOST_WIDE_INT val;
35440 if (!CONST_INT_P (*loc))
35441 return false;
35443 switch (mode)
35445 case DImode:
35446 /* DImode x86_64 constants must fit in 32 bits. */
35447 gcc_assert (x86_64_immediate_operand (*loc, mode));
35449 mode = SImode;
35450 break;
35452 case SImode:
35453 case HImode:
35454 case QImode:
35455 break;
35457 default:
35458 gcc_unreachable ();
35461 /* Avoid overflows. */
35462 if (mode_signbit_p (mode, *loc))
35463 return false;
35465 val = INTVAL (*loc);
35467 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35468 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
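/* For example, "addl $-4,%eax" becomes "subl $4,%eax", while "addl
$128,%eax" (which needs a 32-bit immediate) becomes "subl $-128,%eax"
(which fits in a sign-extended 8-bit immediate); -128 itself is left
alone for the same encoding reason.  */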
35469 if ((val < 0 && val != -128)
35470 || val == 128)
35472 *loc = GEN_INT (-val);
35473 return true;
35476 return false;
35479 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35480 optabs would emit if we didn't have TFmode patterns. */
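/* In outline: a nonnegative input is converted directly with
expand_float; an input with the sign bit set is first halved as
(in >> 1) | (in & 1), converted, and then doubled with f0 + f0.
OR-ing the low bit back in keeps the final rounding correct.  */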
35482 void
35483 x86_emit_floatuns (rtx operands[2])
35485 rtx neglab, donelab, i0, i1, f0, in, out;
35486 enum machine_mode mode, inmode;
35488 inmode = GET_MODE (operands[1]);
35489 gcc_assert (inmode == SImode || inmode == DImode);
35491 out = operands[0];
35492 in = force_reg (inmode, operands[1]);
35493 mode = GET_MODE (out);
35494 neglab = gen_label_rtx ();
35495 donelab = gen_label_rtx ();
35496 f0 = gen_reg_rtx (mode);
35498 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35500 expand_float (out, in, 0);
35502 emit_jump_insn (gen_jump (donelab));
35503 emit_barrier ();
35505 emit_label (neglab);
35507 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35508 1, OPTAB_DIRECT);
35509 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35510 1, OPTAB_DIRECT);
35511 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35513 expand_float (f0, i0, 0);
35515 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35517 emit_label (donelab);
35520 /* AVX2 does support 32-byte integer vector operations,
35521 thus the longest vector we are faced with is V32QImode. */
35522 #define MAX_VECT_LEN 32
35524 struct expand_vec_perm_d
35526 rtx target, op0, op1;
35527 unsigned char perm[MAX_VECT_LEN];
35528 enum machine_mode vmode;
35529 unsigned char nelt;
35530 bool one_operand_p;
35531 bool testing_p;
35534 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35535 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35536 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35538 /* Get a vector mode of the same size as the original but with elements
35539 twice as wide. This is only guaranteed to apply to integral vectors. */
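/* For example (assuming the usual genmodes.c ordering), V16QImode maps
to V8HImode and V8HImode to V4SImode: the same 16-byte vector, with
half as many elements of twice the width.  */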
35541 static inline enum machine_mode
35542 get_mode_wider_vector (enum machine_mode o)
35544 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35545 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35546 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35547 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35548 return n;
35551 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35552 with all elements equal to VAR. Return true if successful. */
35554 static bool
35555 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35556 rtx target, rtx val)
35558 bool ok;
35560 switch (mode)
35562 case V2SImode:
35563 case V2SFmode:
35564 if (!mmx_ok)
35565 return false;
35566 /* FALLTHRU */
35568 case V4DFmode:
35569 case V4DImode:
35570 case V8SFmode:
35571 case V8SImode:
35572 case V2DFmode:
35573 case V2DImode:
35574 case V4SFmode:
35575 case V4SImode:
35577 rtx insn, dup;
35579 /* First attempt to recognize VAL as-is. */
35580 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35581 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35582 if (recog_memoized (insn) < 0)
35584 rtx seq;
35585 /* If that fails, force VAL into a register. */
35587 start_sequence ();
35588 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35589 seq = get_insns ();
35590 end_sequence ();
35591 if (seq)
35592 emit_insn_before (seq, insn);
35594 ok = recog_memoized (insn) >= 0;
35595 gcc_assert (ok);
35598 return true;
35600 case V4HImode:
35601 if (!mmx_ok)
35602 return false;
35603 if (TARGET_SSE || TARGET_3DNOW_A)
35605 rtx x;
35607 val = gen_lowpart (SImode, val);
35608 x = gen_rtx_TRUNCATE (HImode, val);
35609 x = gen_rtx_VEC_DUPLICATE (mode, x);
35610 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35611 return true;
35613 goto widen;
35615 case V8QImode:
35616 if (!mmx_ok)
35617 return false;
35618 goto widen;
35620 case V8HImode:
35621 if (TARGET_SSE2)
35623 struct expand_vec_perm_d dperm;
35624 rtx tmp1, tmp2;
35626 permute:
35627 memset (&dperm, 0, sizeof (dperm));
35628 dperm.target = target;
35629 dperm.vmode = mode;
35630 dperm.nelt = GET_MODE_NUNITS (mode);
35631 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35632 dperm.one_operand_p = true;
35634 /* Extend to SImode using a paradoxical SUBREG. */
35635 tmp1 = gen_reg_rtx (SImode);
35636 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35638 /* Insert the SImode value as low element of a V4SImode vector. */
35639 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35640 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35642 ok = (expand_vec_perm_1 (&dperm)
35643 || expand_vec_perm_broadcast_1 (&dperm));
35644 gcc_assert (ok);
35645 return ok;
35647 goto widen;
35649 case V16QImode:
35650 if (TARGET_SSE2)
35651 goto permute;
35652 goto widen;
35654 widen:
35655 /* Replicate the value once into the next wider mode and recurse. */
35657 enum machine_mode smode, wsmode, wvmode;
35658 rtx x;
35660 smode = GET_MODE_INNER (mode);
35661 wvmode = get_mode_wider_vector (mode);
35662 wsmode = GET_MODE_INNER (wvmode);
35664 val = convert_modes (wsmode, smode, val, true);
35665 x = expand_simple_binop (wsmode, ASHIFT, val,
35666 GEN_INT (GET_MODE_BITSIZE (smode)),
35667 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35668 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35670 x = gen_lowpart (wvmode, target);
35671 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35672 gcc_assert (ok);
35673 return ok;
35676 case V16HImode:
35677 case V32QImode:
35679 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35680 rtx x = gen_reg_rtx (hvmode);
35682 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35683 gcc_assert (ok);
35685 x = gen_rtx_VEC_CONCAT (mode, x, x);
35686 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35688 return true;
35690 default:
35691 return false;
35695 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35696 whose ONE_VAR element is VAR, and other elements are zero. Return true
35697 if successful. */
35699 static bool
35700 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35701 rtx target, rtx var, int one_var)
35703 enum machine_mode vsimode;
35704 rtx new_target;
35705 rtx x, tmp;
35706 bool use_vector_set = false;
35708 switch (mode)
35710 case V2DImode:
35711 /* For SSE4.1, we normally use vector set. But if the second
35712 element is zero and inter-unit moves are OK, we use movq
35713 instead. */
35714 use_vector_set = (TARGET_64BIT
35715 && TARGET_SSE4_1
35716 && !(TARGET_INTER_UNIT_MOVES
35717 && one_var == 0));
35718 break;
35719 case V16QImode:
35720 case V4SImode:
35721 case V4SFmode:
35722 use_vector_set = TARGET_SSE4_1;
35723 break;
35724 case V8HImode:
35725 use_vector_set = TARGET_SSE2;
35726 break;
35727 case V4HImode:
35728 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35729 break;
35730 case V32QImode:
35731 case V16HImode:
35732 case V8SImode:
35733 case V8SFmode:
35734 case V4DFmode:
35735 use_vector_set = TARGET_AVX;
35736 break;
35737 case V4DImode:
35738 /* Use ix86_expand_vector_set in 64bit mode only. */
35739 use_vector_set = TARGET_AVX && TARGET_64BIT;
35740 break;
35741 default:
35742 break;
35745 if (use_vector_set)
35747 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35748 var = force_reg (GET_MODE_INNER (mode), var);
35749 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35750 return true;
35753 switch (mode)
35755 case V2SFmode:
35756 case V2SImode:
35757 if (!mmx_ok)
35758 return false;
35759 /* FALLTHRU */
35761 case V2DFmode:
35762 case V2DImode:
35763 if (one_var != 0)
35764 return false;
35765 var = force_reg (GET_MODE_INNER (mode), var);
35766 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35767 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35768 return true;
35770 case V4SFmode:
35771 case V4SImode:
35772 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35773 new_target = gen_reg_rtx (mode);
35774 else
35775 new_target = target;
35776 var = force_reg (GET_MODE_INNER (mode), var);
35777 x = gen_rtx_VEC_DUPLICATE (mode, var);
35778 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35779 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35780 if (one_var != 0)
35782 /* We need to shuffle the value to the correct position, so
35783 create a new pseudo to store the intermediate result. */
35785 /* With SSE2, we can use the integer shuffle insns. */
35786 if (mode != V4SFmode && TARGET_SSE2)
35788 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35789 const1_rtx,
35790 GEN_INT (one_var == 1 ? 0 : 1),
35791 GEN_INT (one_var == 2 ? 0 : 1),
35792 GEN_INT (one_var == 3 ? 0 : 1)));
35793 if (target != new_target)
35794 emit_move_insn (target, new_target);
35795 return true;
35798 /* Otherwise convert the intermediate result to V4SFmode and
35799 use the SSE1 shuffle instructions. */
35800 if (mode != V4SFmode)
35802 tmp = gen_reg_rtx (V4SFmode);
35803 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35805 else
35806 tmp = new_target;
35808 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35809 const1_rtx,
35810 GEN_INT (one_var == 1 ? 0 : 1),
35811 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35812 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35814 if (mode != V4SFmode)
35815 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35816 else if (tmp != target)
35817 emit_move_insn (target, tmp);
35819 else if (target != new_target)
35820 emit_move_insn (target, new_target);
35821 return true;
35823 case V8HImode:
35824 case V16QImode:
35825 vsimode = V4SImode;
35826 goto widen;
35827 case V4HImode:
35828 case V8QImode:
35829 if (!mmx_ok)
35830 return false;
35831 vsimode = V2SImode;
35832 goto widen;
35833 widen:
35834 if (one_var != 0)
35835 return false;
35837 /* Zero extend the variable element to SImode and recurse. */
35838 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35840 x = gen_reg_rtx (vsimode);
35841 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35842 var, one_var))
35843 gcc_unreachable ();
35845 emit_move_insn (target, gen_lowpart (mode, x));
35846 return true;
35848 default:
35849 return false;
35853 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35854 consisting of the values in VALS. It is known that all elements
35855 except ONE_VAR are constants. Return true if successful. */
35857 static bool
35858 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35859 rtx target, rtx vals, int one_var)
35861 rtx var = XVECEXP (vals, 0, one_var);
35862 enum machine_mode wmode;
35863 rtx const_vec, x;
35865 const_vec = copy_rtx (vals);
35866 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35867 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35869 switch (mode)
35871 case V2DFmode:
35872 case V2DImode:
35873 case V2SFmode:
35874 case V2SImode:
35875 /* For the two element vectors, it's just as easy to use
35876 the general case. */
35877 return false;
35879 case V4DImode:
35880 /* Use ix86_expand_vector_set in 64bit mode only. */
35881 if (!TARGET_64BIT)
35882 return false;
35883 case V4DFmode:
35884 case V8SFmode:
35885 case V8SImode:
35886 case V16HImode:
35887 case V32QImode:
35888 case V4SFmode:
35889 case V4SImode:
35890 case V8HImode:
35891 case V4HImode:
35892 break;
35894 case V16QImode:
35895 if (TARGET_SSE4_1)
35896 break;
35897 wmode = V8HImode;
35898 goto widen;
35899 case V8QImode:
35900 wmode = V4HImode;
35901 goto widen;
35902 widen:
35903 /* There's no way to set one QImode entry easily. Combine
35904 the variable value with its adjacent constant value, and
35905 promote to an HImode set. */
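/* For instance (hypothetical element numbers), to set QImode element 3
the variable value is shifted left by 8 and OR'ed with the constant
from element 2, and the combined value is stored as HImode element 1
of the wider vector.  */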
35906 x = XVECEXP (vals, 0, one_var ^ 1);
35907 if (one_var & 1)
35909 var = convert_modes (HImode, QImode, var, true);
35910 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35911 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35912 x = GEN_INT (INTVAL (x) & 0xff);
35914 else
35916 var = convert_modes (HImode, QImode, var, true);
35917 x = gen_int_mode (INTVAL (x) << 8, HImode);
35919 if (x != const0_rtx)
35920 var = expand_simple_binop (HImode, IOR, var, x, var,
35921 1, OPTAB_LIB_WIDEN);
35923 x = gen_reg_rtx (wmode);
35924 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35925 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35927 emit_move_insn (target, gen_lowpart (mode, x));
35928 return true;
35930 default:
35931 return false;
35934 emit_move_insn (target, const_vec);
35935 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35936 return true;
35939 /* A subroutine of ix86_expand_vector_init_general. Use vector
35940 concatenate to handle the most general case: all values variable,
35941 and none identical. */
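/* As an illustration: an 8-element V8SFmode build is handled by first
combining the scalars pairwise into four V2SFmode values, then those
into two V4SFmode halves, and finally VEC_CONCATing the halves into
the full 256-bit vector.  */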
35943 static void
35944 ix86_expand_vector_init_concat (enum machine_mode mode,
35945 rtx target, rtx *ops, int n)
35947 enum machine_mode cmode, hmode = VOIDmode;
35948 rtx first[8], second[4];
35949 rtvec v;
35950 int i, j;
35952 switch (n)
35954 case 2:
35955 switch (mode)
35957 case V8SImode:
35958 cmode = V4SImode;
35959 break;
35960 case V8SFmode:
35961 cmode = V4SFmode;
35962 break;
35963 case V4DImode:
35964 cmode = V2DImode;
35965 break;
35966 case V4DFmode:
35967 cmode = V2DFmode;
35968 break;
35969 case V4SImode:
35970 cmode = V2SImode;
35971 break;
35972 case V4SFmode:
35973 cmode = V2SFmode;
35974 break;
35975 case V2DImode:
35976 cmode = DImode;
35977 break;
35978 case V2SImode:
35979 cmode = SImode;
35980 break;
35981 case V2DFmode:
35982 cmode = DFmode;
35983 break;
35984 case V2SFmode:
35985 cmode = SFmode;
35986 break;
35987 default:
35988 gcc_unreachable ();
35991 if (!register_operand (ops[1], cmode))
35992 ops[1] = force_reg (cmode, ops[1]);
35993 if (!register_operand (ops[0], cmode))
35994 ops[0] = force_reg (cmode, ops[0]);
35995 emit_insn (gen_rtx_SET (VOIDmode, target,
35996 gen_rtx_VEC_CONCAT (mode, ops[0],
35997 ops[1])));
35998 break;
36000 case 4:
36001 switch (mode)
36003 case V4DImode:
36004 cmode = V2DImode;
36005 break;
36006 case V4DFmode:
36007 cmode = V2DFmode;
36008 break;
36009 case V4SImode:
36010 cmode = V2SImode;
36011 break;
36012 case V4SFmode:
36013 cmode = V2SFmode;
36014 break;
36015 default:
36016 gcc_unreachable ();
36018 goto half;
36020 case 8:
36021 switch (mode)
36023 case V8SImode:
36024 cmode = V2SImode;
36025 hmode = V4SImode;
36026 break;
36027 case V8SFmode:
36028 cmode = V2SFmode;
36029 hmode = V4SFmode;
36030 break;
36031 default:
36032 gcc_unreachable ();
36034 goto half;
36036 half:
36037 /* FIXME: We process inputs backward to help RA. PR 36222. */
36038 i = n - 1;
36039 j = (n >> 1) - 1;
36040 for (; i > 0; i -= 2, j--)
36042 first[j] = gen_reg_rtx (cmode);
36043 v = gen_rtvec (2, ops[i - 1], ops[i]);
36044 ix86_expand_vector_init (false, first[j],
36045 gen_rtx_PARALLEL (cmode, v));
36048 n >>= 1;
36049 if (n > 2)
36051 gcc_assert (hmode != VOIDmode);
36052 for (i = j = 0; i < n; i += 2, j++)
36054 second[j] = gen_reg_rtx (hmode);
36055 ix86_expand_vector_init_concat (hmode, second [j],
36056 &first [i], 2);
36058 n >>= 1;
36059 ix86_expand_vector_init_concat (mode, target, second, n);
36061 else
36062 ix86_expand_vector_init_concat (mode, target, first, n);
36063 break;
36065 default:
36066 gcc_unreachable ();
36070 /* A subroutine of ix86_expand_vector_init_general. Use vector
36071 interleave to handle the most general case: all values variable,
36072 and none identical. */
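/* Sketch of the approach below: each pair of scalar inputs is packed
into the low two elements of a temporary vector (the even element via a
V4SImode insert, the odd element via gen_load_even), and the temporaries
are then combined with successively wider interleave-low operations:
V4SI then V2DI for V8HImode inputs, V8HI then V4SI then V2DI for
V16QImode.  */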
36074 static void
36075 ix86_expand_vector_init_interleave (enum machine_mode mode,
36076 rtx target, rtx *ops, int n)
36078 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36079 int i, j;
36080 rtx op0, op1;
36081 rtx (*gen_load_even) (rtx, rtx, rtx);
36082 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36083 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36085 switch (mode)
36087 case V8HImode:
36088 gen_load_even = gen_vec_setv8hi;
36089 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36090 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36091 inner_mode = HImode;
36092 first_imode = V4SImode;
36093 second_imode = V2DImode;
36094 third_imode = VOIDmode;
36095 break;
36096 case V16QImode:
36097 gen_load_even = gen_vec_setv16qi;
36098 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36099 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36100 inner_mode = QImode;
36101 first_imode = V8HImode;
36102 second_imode = V4SImode;
36103 third_imode = V2DImode;
36104 break;
36105 default:
36106 gcc_unreachable ();
36109 for (i = 0; i < n; i++)
36111 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36112 op0 = gen_reg_rtx (SImode);
36113 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36115 /* Insert the SImode value as low element of V4SImode vector. */
36116 op1 = gen_reg_rtx (V4SImode);
36117 op0 = gen_rtx_VEC_MERGE (V4SImode,
36118 gen_rtx_VEC_DUPLICATE (V4SImode,
36119 op0),
36120 CONST0_RTX (V4SImode),
36121 const1_rtx);
36122 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36124 /* Cast the V4SImode vector back to a vector in the original mode. */
36125 op0 = gen_reg_rtx (mode);
36126 emit_move_insn (op0, gen_lowpart (mode, op1));
36128 /* Load the even elements into the second position. */
36129 emit_insn (gen_load_even (op0,
36130 force_reg (inner_mode,
36131 ops [i + i + 1]),
36132 const1_rtx));
36134 /* Cast vector to FIRST_IMODE vector. */
36135 ops[i] = gen_reg_rtx (first_imode);
36136 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36139 /* Interleave low FIRST_IMODE vectors. */
36140 for (i = j = 0; i < n; i += 2, j++)
36142 op0 = gen_reg_rtx (first_imode);
36143 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36145 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36146 ops[j] = gen_reg_rtx (second_imode);
36147 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36150 /* Interleave low SECOND_IMODE vectors. */
36151 switch (second_imode)
36153 case V4SImode:
36154 for (i = j = 0; i < n / 2; i += 2, j++)
36156 op0 = gen_reg_rtx (second_imode);
36157 emit_insn (gen_interleave_second_low (op0, ops[i],
36158 ops[i + 1]));
36160 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36161 vector. */
36162 ops[j] = gen_reg_rtx (third_imode);
36163 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36165 second_imode = V2DImode;
36166 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36167 /* FALLTHRU */
36169 case V2DImode:
36170 op0 = gen_reg_rtx (second_imode);
36171 emit_insn (gen_interleave_second_low (op0, ops[0],
36172 ops[1]));
36174 /* Cast the SECOND_IMODE vector back to a vector in the original
36175 mode. */
36176 emit_insn (gen_rtx_SET (VOIDmode, target,
36177 gen_lowpart (mode, op0)));
36178 break;
36180 default:
36181 gcc_unreachable ();
36185 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36186 all values variable, and none identical. */
36188 static void
36189 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36190 rtx target, rtx vals)
36192 rtx ops[32], op0, op1;
36193 enum machine_mode half_mode = VOIDmode;
36194 int n, i;
36196 switch (mode)
36198 case V2SFmode:
36199 case V2SImode:
36200 if (!mmx_ok && !TARGET_SSE)
36201 break;
36202 /* FALLTHRU */
36204 case V8SFmode:
36205 case V8SImode:
36206 case V4DFmode:
36207 case V4DImode:
36208 case V4SFmode:
36209 case V4SImode:
36210 case V2DFmode:
36211 case V2DImode:
36212 n = GET_MODE_NUNITS (mode);
36213 for (i = 0; i < n; i++)
36214 ops[i] = XVECEXP (vals, 0, i);
36215 ix86_expand_vector_init_concat (mode, target, ops, n);
36216 return;
36218 case V32QImode:
36219 half_mode = V16QImode;
36220 goto half;
36222 case V16HImode:
36223 half_mode = V8HImode;
36224 goto half;
36226 half:
36227 n = GET_MODE_NUNITS (mode);
36228 for (i = 0; i < n; i++)
36229 ops[i] = XVECEXP (vals, 0, i);
36230 op0 = gen_reg_rtx (half_mode);
36231 op1 = gen_reg_rtx (half_mode);
36232 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36233 n >> 2);
36234 ix86_expand_vector_init_interleave (half_mode, op1,
36235 &ops [n >> 1], n >> 2);
36236 emit_insn (gen_rtx_SET (VOIDmode, target,
36237 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36238 return;
36240 case V16QImode:
36241 if (!TARGET_SSE4_1)
36242 break;
36243 /* FALLTHRU */
36245 case V8HImode:
36246 if (!TARGET_SSE2)
36247 break;
36249 /* Don't use ix86_expand_vector_init_interleave if we can't
36250 move from GPR to SSE register directly. */
36251 if (!TARGET_INTER_UNIT_MOVES)
36252 break;
36254 n = GET_MODE_NUNITS (mode);
36255 for (i = 0; i < n; i++)
36256 ops[i] = XVECEXP (vals, 0, i);
36257 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36258 return;
36260 case V4HImode:
36261 case V8QImode:
36262 break;
36264 default:
36265 gcc_unreachable ();
36269 int i, j, n_elts, n_words, n_elt_per_word;
36270 enum machine_mode inner_mode;
36271 rtx words[4], shift;
36273 inner_mode = GET_MODE_INNER (mode);
36274 n_elts = GET_MODE_NUNITS (mode);
36275 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36276 n_elt_per_word = n_elts / n_words;
36277 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36279 for (i = 0; i < n_words; ++i)
36281 rtx word = NULL_RTX;
36283 for (j = 0; j < n_elt_per_word; ++j)
36285 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36286 elt = convert_modes (word_mode, inner_mode, elt, true);
36288 if (j == 0)
36289 word = elt;
36290 else
36292 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36293 word, 1, OPTAB_LIB_WIDEN);
36294 word = expand_simple_binop (word_mode, IOR, word, elt,
36295 word, 1, OPTAB_LIB_WIDEN);
36299 words[i] = word;
36302 if (n_words == 1)
36303 emit_move_insn (target, gen_lowpart (mode, words[0]));
36304 else if (n_words == 2)
36306 rtx tmp = gen_reg_rtx (mode);
36307 emit_clobber (tmp);
36308 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36309 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36310 emit_move_insn (target, tmp);
36312 else if (n_words == 4)
36314 rtx tmp = gen_reg_rtx (V4SImode);
36315 gcc_assert (word_mode == SImode);
36316 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36317 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36318 emit_move_insn (target, gen_lowpart (mode, tmp));
36320 else
36321 gcc_unreachable ();
36325 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36326 instructions unless MMX_OK is true. */
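/* Dispatch summary for the code below: an all-constant vector is loaded
from the constant pool; identical elements are broadcast via
ix86_expand_vector_init_duplicate; a single variable element is handled
by loading the constant part and inserting the variable one; everything
else falls through to ix86_expand_vector_init_general.  */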
36328 void
36329 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36331 enum machine_mode mode = GET_MODE (target);
36332 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36333 int n_elts = GET_MODE_NUNITS (mode);
36334 int n_var = 0, one_var = -1;
36335 bool all_same = true, all_const_zero = true;
36336 int i;
36337 rtx x;
36339 for (i = 0; i < n_elts; ++i)
36341 x = XVECEXP (vals, 0, i);
36342 if (!(CONST_INT_P (x)
36343 || GET_CODE (x) == CONST_DOUBLE
36344 || GET_CODE (x) == CONST_FIXED))
36345 n_var++, one_var = i;
36346 else if (x != CONST0_RTX (inner_mode))
36347 all_const_zero = false;
36348 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36349 all_same = false;
36352 /* Constants are best loaded from the constant pool. */
36353 if (n_var == 0)
36355 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36356 return;
36359 /* If all values are identical, broadcast the value. */
36360 if (all_same
36361 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36362 XVECEXP (vals, 0, 0)))
36363 return;
36365 /* Values where only one field is non-constant are best loaded from
36366 the pool and overwritten via move later. */
36367 if (n_var == 1)
36369 if (all_const_zero
36370 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36371 XVECEXP (vals, 0, one_var),
36372 one_var))
36373 return;
36375 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36376 return;
36379 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36382 void
36383 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36385 enum machine_mode mode = GET_MODE (target);
36386 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36387 enum machine_mode half_mode;
36388 bool use_vec_merge = false;
36389 rtx tmp;
36390 static rtx (*gen_extract[6][2]) (rtx, rtx)
36392 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36393 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36394 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36395 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36396 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36397 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36399 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36401 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36402 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36403 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36404 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36405 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36406 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36408 int i, j, n;
36410 switch (mode)
36412 case V2SFmode:
36413 case V2SImode:
36414 if (mmx_ok)
36416 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36417 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36418 if (elt == 0)
36419 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36420 else
36421 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36422 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36423 return;
36425 break;
36427 case V2DImode:
36428 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36429 if (use_vec_merge)
36430 break;
36432 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36433 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36434 if (elt == 0)
36435 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36436 else
36437 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36438 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36439 return;
36441 case V2DFmode:
36443 rtx op0, op1;
36445 /* For the two element vectors, we implement a VEC_CONCAT with
36446 the extraction of the other element. */
36448 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36449 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36451 if (elt == 0)
36452 op0 = val, op1 = tmp;
36453 else
36454 op0 = tmp, op1 = val;
36456 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36457 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36459 return;
36461 case V4SFmode:
36462 use_vec_merge = TARGET_SSE4_1;
36463 if (use_vec_merge)
36464 break;
36466 switch (elt)
36468 case 0:
36469 use_vec_merge = true;
36470 break;
36472 case 1:
36473 /* tmp = target = A B C D */
36474 tmp = copy_to_reg (target);
36475 /* target = A A B B */
36476 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36477 /* target = X A B B */
36478 ix86_expand_vector_set (false, target, val, 0);
36479 /* target = A X C D */
36480 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36481 const1_rtx, const0_rtx,
36482 GEN_INT (2+4), GEN_INT (3+4)));
36483 return;
36485 case 2:
36486 /* tmp = target = A B C D */
36487 tmp = copy_to_reg (target);
36488 /* tmp = X B C D */
36489 ix86_expand_vector_set (false, tmp, val, 0);
36490 /* target = A B X D */
36491 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36492 const0_rtx, const1_rtx,
36493 GEN_INT (0+4), GEN_INT (3+4)));
36494 return;
36496 case 3:
36497 /* tmp = target = A B C D */
36498 tmp = copy_to_reg (target);
36499 /* tmp = X B C D */
36500 ix86_expand_vector_set (false, tmp, val, 0);
36501 /* target = A B C X */
36502 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36503 const0_rtx, const1_rtx,
36504 GEN_INT (2+4), GEN_INT (0+4)));
36505 return;
36507 default:
36508 gcc_unreachable ();
36510 break;
36512 case V4SImode:
36513 use_vec_merge = TARGET_SSE4_1;
36514 if (use_vec_merge)
36515 break;
36517 /* Element 0 handled by vec_merge below. */
36518 if (elt == 0)
36520 use_vec_merge = true;
36521 break;
36524 if (TARGET_SSE2)
36526 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36527 store into element 0, then shuffle them back. */
36529 rtx order[4];
36531 order[0] = GEN_INT (elt);
36532 order[1] = const1_rtx;
36533 order[2] = const2_rtx;
36534 order[3] = GEN_INT (3);
36535 order[elt] = const0_rtx;
36537 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36538 order[1], order[2], order[3]));
36540 ix86_expand_vector_set (false, target, val, 0);
36542 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36543 order[1], order[2], order[3]));
36545 else
36547 /* For SSE1, we have to reuse the V4SF code. */
36548 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36549 gen_lowpart (SFmode, val), elt);
36551 return;
36553 case V8HImode:
36554 use_vec_merge = TARGET_SSE2;
36555 break;
36556 case V4HImode:
36557 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36558 break;
36560 case V16QImode:
36561 use_vec_merge = TARGET_SSE4_1;
36562 break;
36564 case V8QImode:
36565 break;
36567 case V32QImode:
36568 half_mode = V16QImode;
36569 j = 0;
36570 n = 16;
36571 goto half;
36573 case V16HImode:
36574 half_mode = V8HImode;
36575 j = 1;
36576 n = 8;
36577 goto half;
36579 case V8SImode:
36580 half_mode = V4SImode;
36581 j = 2;
36582 n = 4;
36583 goto half;
36585 case V4DImode:
36586 half_mode = V2DImode;
36587 j = 3;
36588 n = 2;
36589 goto half;
36591 case V8SFmode:
36592 half_mode = V4SFmode;
36593 j = 4;
36594 n = 4;
36595 goto half;
36597 case V4DFmode:
36598 half_mode = V2DFmode;
36599 j = 5;
36600 n = 2;
36601 goto half;
36603 half:
36604 /* Compute offset. */
36605 i = elt / n;
36606 elt %= n;
36608 gcc_assert (i <= 1);
36610 /* Extract the half. */
36611 tmp = gen_reg_rtx (half_mode);
36612 emit_insn (gen_extract[j][i] (tmp, target));
36614 /* Put val in tmp at elt. */
36615 ix86_expand_vector_set (false, tmp, val, elt);
36617 /* Put it back. */
36618 emit_insn (gen_insert[j][i] (target, target, tmp));
36619 return;
36621 default:
36622 break;
36625 if (use_vec_merge)
36627 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36628 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36629 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36631 else
36633 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36635 emit_move_insn (mem, target);
36637 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36638 emit_move_insn (tmp, val);
36640 emit_move_insn (target, mem);
36644 void
36645 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36647 enum machine_mode mode = GET_MODE (vec);
36648 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36649 bool use_vec_extr = false;
36650 rtx tmp;
36652 switch (mode)
36654 case V2SImode:
36655 case V2SFmode:
36656 if (!mmx_ok)
36657 break;
36658 /* FALLTHRU */
36660 case V2DFmode:
36661 case V2DImode:
36662 use_vec_extr = true;
36663 break;
36665 case V4SFmode:
36666 use_vec_extr = TARGET_SSE4_1;
36667 if (use_vec_extr)
36668 break;
36670 switch (elt)
36672 case 0:
36673 tmp = vec;
36674 break;
36676 case 1:
36677 case 3:
36678 tmp = gen_reg_rtx (mode);
36679 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36680 GEN_INT (elt), GEN_INT (elt),
36681 GEN_INT (elt+4), GEN_INT (elt+4)));
36682 break;
36684 case 2:
36685 tmp = gen_reg_rtx (mode);
36686 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36687 break;
36689 default:
36690 gcc_unreachable ();
36692 vec = tmp;
36693 use_vec_extr = true;
36694 elt = 0;
36695 break;
36697 case V4SImode:
36698 use_vec_extr = TARGET_SSE4_1;
36699 if (use_vec_extr)
36700 break;
36702 if (TARGET_SSE2)
36704 switch (elt)
36706 case 0:
36707 tmp = vec;
36708 break;
36710 case 1:
36711 case 3:
36712 tmp = gen_reg_rtx (mode);
36713 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36714 GEN_INT (elt), GEN_INT (elt),
36715 GEN_INT (elt), GEN_INT (elt)));
36716 break;
36718 case 2:
36719 tmp = gen_reg_rtx (mode);
36720 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36721 break;
36723 default:
36724 gcc_unreachable ();
36726 vec = tmp;
36727 use_vec_extr = true;
36728 elt = 0;
36730 else
36732 /* For SSE1, we have to reuse the V4SF code. */
36733 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36734 gen_lowpart (V4SFmode, vec), elt);
36735 return;
36737 break;
36739 case V8HImode:
36740 use_vec_extr = TARGET_SSE2;
36741 break;
36742 case V4HImode:
36743 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36744 break;
36746 case V16QImode:
36747 use_vec_extr = TARGET_SSE4_1;
36748 break;
36750 case V8SFmode:
36751 if (TARGET_AVX)
36753 tmp = gen_reg_rtx (V4SFmode);
36754 if (elt < 4)
36755 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36756 else
36757 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36758 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36759 return;
36761 break;
36763 case V4DFmode:
36764 if (TARGET_AVX)
36766 tmp = gen_reg_rtx (V2DFmode);
36767 if (elt < 2)
36768 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36769 else
36770 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36771 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36772 return;
36774 break;
36776 case V32QImode:
36777 if (TARGET_AVX)
36779 tmp = gen_reg_rtx (V16QImode);
36780 if (elt < 16)
36781 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36782 else
36783 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36784 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36785 return;
36787 break;
36789 case V16HImode:
36790 if (TARGET_AVX)
36792 tmp = gen_reg_rtx (V8HImode);
36793 if (elt < 8)
36794 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36795 else
36796 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36797 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36798 return;
36800 break;
36802 case V8SImode:
36803 if (TARGET_AVX)
36805 tmp = gen_reg_rtx (V4SImode);
36806 if (elt < 4)
36807 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36808 else
36809 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36810 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36811 return;
36813 break;
36815 case V4DImode:
36816 if (TARGET_AVX)
36818 tmp = gen_reg_rtx (V2DImode);
36819 if (elt < 2)
36820 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36821 else
36822 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36823 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36824 return;
36826 break;
36828 case V8QImode:
36829 /* ??? Could extract the appropriate HImode element and shift. */
36830 default:
36831 break;
36834 if (use_vec_extr)
36836 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36837 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36839 /* Let the rtl optimizers know about the zero extension performed. */
36840 if (inner_mode == QImode || inner_mode == HImode)
36842 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36843 target = gen_lowpart (SImode, target);
36846 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36848 else
36850 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36852 emit_move_insn (mem, vec);
36854 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36855 emit_move_insn (target, tmp);
36859 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36860 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36861 The upper bits of DEST are undefined, though they shouldn't cause
36862 exceptions (some bits from src or all zeros are ok). */
36864 static void
36865 emit_reduc_half (rtx dest, rtx src, int i)
36867 rtx tem;
36868 switch (GET_MODE (src))
36870 case V4SFmode:
36871 if (i == 128)
36872 tem = gen_sse_movhlps (dest, src, src);
36873 else
36874 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36875 GEN_INT (1 + 4), GEN_INT (1 + 4));
36876 break;
36877 case V2DFmode:
36878 tem = gen_vec_interleave_highv2df (dest, src, src);
36879 break;
36880 case V16QImode:
36881 case V8HImode:
36882 case V4SImode:
36883 case V2DImode:
36884 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36885 gen_lowpart (V1TImode, src),
36886 GEN_INT (i / 2));
36887 break;
36888 case V8SFmode:
36889 if (i == 256)
36890 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36891 else
36892 tem = gen_avx_shufps256 (dest, src, src,
36893 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36894 break;
36895 case V4DFmode:
36896 if (i == 256)
36897 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36898 else
36899 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36900 break;
36901 case V32QImode:
36902 case V16HImode:
36903 case V8SImode:
36904 case V4DImode:
36905 if (i == 256)
36906 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36907 gen_lowpart (V4DImode, src),
36908 gen_lowpart (V4DImode, src),
36909 const1_rtx);
36910 else
36911 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36912 gen_lowpart (V2TImode, src),
36913 GEN_INT (i / 2));
36914 break;
36915 default:
36916 gcc_unreachable ();
36918 emit_insn (tem);
36921 /* Expand a vector reduction. FN is the binary pattern to reduce;
36922 DEST is the destination; IN is the input vector. */
36924 void
36925 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36927 rtx half, dst, vec = in;
36928 enum machine_mode mode = GET_MODE (in);
36929 int i;
36931 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
36932 if (TARGET_SSE4_1
36933 && mode == V8HImode
36934 && fn == gen_uminv8hi3)
36936 emit_insn (gen_sse4_1_phminposuw (dest, in));
36937 return;
36940 for (i = GET_MODE_BITSIZE (mode);
36941 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
36942 i >>= 1)
36944 half = gen_reg_rtx (mode);
36945 emit_reduc_half (half, vec, i);
36946 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
36947 dst = dest;
36948 else
36949 dst = gen_reg_rtx (mode);
36950 emit_insn (fn (dst, half, vec));
36951 vec = dst;
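/* An illustrative sketch of the loop above (not from the original sources):
   for a V4SImode reduction it performs two halving steps,
     half = vec shifted right by 64 bits;  dst  = fn (half, vec);   (i == 128)
     half = dst shifted right by 32 bits;  dest = fn (half, dst);   (i == 64)
   so after log2(nelt) applications of FN, element 0 of DEST holds the
   reduced value and the remaining elements are don't-cares.  */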
36955 /* Target hook for scalar_mode_supported_p. */
36956 static bool
36957 ix86_scalar_mode_supported_p (enum machine_mode mode)
36959 if (DECIMAL_FLOAT_MODE_P (mode))
36960 return default_decimal_float_supported_p ();
36961 else if (mode == TFmode)
36962 return true;
36963 else
36964 return default_scalar_mode_supported_p (mode);
36967 /* Implements target hook vector_mode_supported_p. */
36968 static bool
36969 ix86_vector_mode_supported_p (enum machine_mode mode)
36971 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
36972 return true;
36973 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
36974 return true;
36975 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
36976 return true;
36977 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
36978 return true;
36979 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
36980 return true;
36981 return false;
36984 /* Target hook for c_mode_for_suffix. */
36985 static enum machine_mode
36986 ix86_c_mode_for_suffix (char suffix)
36988 if (suffix == 'q')
36989 return TFmode;
36990 if (suffix == 'w')
36991 return XFmode;
36993 return VOIDmode;
36996 /* Worker function for TARGET_MD_ASM_CLOBBERS.
36998 We do this in the new i386 backend to maintain source compatibility
36999 with the old cc0-based compiler. */
37001 static tree
37002 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37003 tree inputs ATTRIBUTE_UNUSED,
37004 tree clobbers)
37006 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37007 clobbers);
37008 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37009 clobbers);
37010 return clobbers;
37013 /* Implements the targetm.asm.encode_section_info target hook. */
37015 static void ATTRIBUTE_UNUSED
37016 ix86_encode_section_info (tree decl, rtx rtl, int first)
37018 default_encode_section_info (decl, rtl, first);
37020 if (TREE_CODE (decl) == VAR_DECL
37021 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37022 && ix86_in_large_data_p (decl))
37023 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37026 /* Worker function for REVERSE_CONDITION. */
37028 enum rtx_code
37029 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37031 return (mode != CCFPmode && mode != CCFPUmode
37032 ? reverse_condition (code)
37033 : reverse_condition_maybe_unordered (code));
37036 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37037 to OPERANDS[0]. */
37039 const char *
37040 output_387_reg_move (rtx insn, rtx *operands)
37042 if (REG_P (operands[0]))
37044 if (REG_P (operands[1])
37045 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37047 if (REGNO (operands[0]) == FIRST_STACK_REG)
37048 return output_387_ffreep (operands, 0);
37049 return "fstp\t%y0";
37051 if (STACK_TOP_P (operands[0]))
37052 return "fld%Z1\t%y1";
37053 return "fst\t%y0";
37055 else if (MEM_P (operands[0]))
37057 gcc_assert (REG_P (operands[1]));
37058 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37059 return "fstp%Z0\t%y0";
37060 else
37062 /* There is no non-popping store to memory for XFmode.
37063 So if we need one, follow the store with a load. */
37064 if (GET_MODE (operands[0]) == XFmode)
37065 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37066 else
37067 return "fst%Z0\t%y0";
37070 else
37071 gcc_unreachable();
37074 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37075 FP status register is set. */
37077 void
37078 ix86_emit_fp_unordered_jump (rtx label)
37080 rtx reg = gen_reg_rtx (HImode);
37081 rtx temp;
37083 emit_insn (gen_x86_fnstsw_1 (reg));
37085 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37087 emit_insn (gen_x86_sahf_1 (reg));
37089 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37090 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37092 else
37094 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37096 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37097 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37100 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37101 gen_rtx_LABEL_REF (VOIDmode, label),
37102 pc_rtx);
37103 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37105 emit_jump_insn (temp);
37106 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37109 /* Output code to perform a log1p XFmode calculation. */
37111 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37113 rtx label1 = gen_label_rtx ();
37114 rtx label2 = gen_label_rtx ();
37116 rtx tmp = gen_reg_rtx (XFmode);
37117 rtx tmp2 = gen_reg_rtx (XFmode);
37118 rtx test;
37120 emit_insn (gen_absxf2 (tmp, op1));
37121 test = gen_rtx_GE (VOIDmode, tmp,
37122 CONST_DOUBLE_FROM_REAL_VALUE (
37123 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37124 XFmode));
37125 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37127 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37128 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37129 emit_jump (label2);
37131 emit_label (label1);
37132 emit_move_insn (tmp, CONST1_RTX (XFmode));
37133 emit_insn (gen_addxf3 (tmp, op1, tmp));
37134 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37135 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37137 emit_label (label2);
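/* Note on the magic constant above (illustrative): 0.29289321881... is
   approximately 1 - sqrt(2)/2, the input bound documented for the FYL2XP1
   instruction.  For |op1| below that bound the sequence computes
   fldln2 * fyl2xp1 (op1), i.e. ln (1 + op1) directly; otherwise it falls
   back to fldln2 * fyl2x (1 + op1), which is safe for larger arguments at
   the cost of the explicit addition.  */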
37140 /* Emit i387 code to compute round (OP1), storing the result into OP0. */
37141 void ix86_emit_i387_round (rtx op0, rtx op1)
37143 enum machine_mode inmode = GET_MODE (op1);
37144 enum machine_mode outmode = GET_MODE (op0);
37145 rtx e1, e2, res, tmp, tmp1, half;
37146 rtx scratch = gen_reg_rtx (HImode);
37147 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37148 rtx jump_label = gen_label_rtx ();
37149 rtx insn;
37150 rtx (*gen_abs) (rtx, rtx);
37151 rtx (*gen_neg) (rtx, rtx);
37153 switch (inmode)
37155 case SFmode:
37156 gen_abs = gen_abssf2;
37157 break;
37158 case DFmode:
37159 gen_abs = gen_absdf2;
37160 break;
37161 case XFmode:
37162 gen_abs = gen_absxf2;
37163 break;
37164 default:
37165 gcc_unreachable ();
37168 switch (outmode)
37170 case SFmode:
37171 gen_neg = gen_negsf2;
37172 break;
37173 case DFmode:
37174 gen_neg = gen_negdf2;
37175 break;
37176 case XFmode:
37177 gen_neg = gen_negxf2;
37178 break;
37179 case HImode:
37180 gen_neg = gen_neghi2;
37181 break;
37182 case SImode:
37183 gen_neg = gen_negsi2;
37184 break;
37185 case DImode:
37186 gen_neg = gen_negdi2;
37187 break;
37188 default:
37189 gcc_unreachable ();
37192 e1 = gen_reg_rtx (inmode);
37193 e2 = gen_reg_rtx (inmode);
37194 res = gen_reg_rtx (outmode);
37196 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37198 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37200 /* scratch = fxam(op1) */
37201 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37202 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37203 UNSPEC_FXAM)));
37204 /* e1 = fabs(op1) */
37205 emit_insn (gen_abs (e1, op1));
37207 /* e2 = e1 + 0.5 */
37208 half = force_reg (inmode, half);
37209 emit_insn (gen_rtx_SET (VOIDmode, e2,
37210 gen_rtx_PLUS (inmode, e1, half)));
37212 /* res = floor(e2) */
37213 if (inmode != XFmode)
37215 tmp1 = gen_reg_rtx (XFmode);
37217 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37218 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37220 else
37221 tmp1 = e2;
37223 switch (outmode)
37225 case SFmode:
37226 case DFmode:
37228 rtx tmp0 = gen_reg_rtx (XFmode);
37230 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37232 emit_insn (gen_rtx_SET (VOIDmode, res,
37233 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37234 UNSPEC_TRUNC_NOOP)));
37236 break;
37237 case XFmode:
37238 emit_insn (gen_frndintxf2_floor (res, tmp1));
37239 break;
37240 case HImode:
37241 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37242 break;
37243 case SImode:
37244 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37245 break;
37246 case DImode:
37247 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37248 break;
37249 default:
37250 gcc_unreachable ();
37253 /* flags = signbit(a) */
37254 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37256 /* if (flags) then res = -res */
37257 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37258 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37259 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37260 pc_rtx);
37261 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37262 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37263 JUMP_LABEL (insn) = jump_label;
37265 emit_insn (gen_neg (res, res));
37267 emit_label (jump_label);
37268 LABEL_NUSES (jump_label) = 1;
37270 emit_move_insn (op0, res);
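/* Rough C equivalent of the sequence above (illustrative only):
     res = floor (fabs (op1) + 0.5);
     op0 = signbit (op1) ? -res : res;
   so halfway cases are rounded away from zero, matching round ().  */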
37273 /* Output code to perform a Newton-Raphson approximation of a single precision
37274 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37276 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37278 rtx x0, x1, e0, e1;
37280 x0 = gen_reg_rtx (mode);
37281 e0 = gen_reg_rtx (mode);
37282 e1 = gen_reg_rtx (mode);
37283 x1 = gen_reg_rtx (mode);
37285 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37287 b = force_reg (mode, b);
37289 /* x0 = rcp(b) estimate */
37290 emit_insn (gen_rtx_SET (VOIDmode, x0,
37291 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37292 UNSPEC_RCP)));
37293 /* e0 = x0 * b */
37294 emit_insn (gen_rtx_SET (VOIDmode, e0,
37295 gen_rtx_MULT (mode, x0, b)));
37297 /* e0 = x0 * e0 */
37298 emit_insn (gen_rtx_SET (VOIDmode, e0,
37299 gen_rtx_MULT (mode, x0, e0)));
37301 /* e1 = x0 + x0 */
37302 emit_insn (gen_rtx_SET (VOIDmode, e1,
37303 gen_rtx_PLUS (mode, x0, x0)));
37305 /* x1 = e1 - e0 */
37306 emit_insn (gen_rtx_SET (VOIDmode, x1,
37307 gen_rtx_MINUS (mode, e1, e0)));
37309 /* res = a * x1 */
37310 emit_insn (gen_rtx_SET (VOIDmode, res,
37311 gen_rtx_MULT (mode, a, x1)));
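/* The sequence above is one Newton-Raphson step for the reciprocal:
   starting from the rcp estimate x0 ~ 1/b, it computes
     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
   which is exactly e1 - e0.  One step roughly doubles the roughly 12-bit
   accuracy of the hardware estimate, bringing it close to single
   precision.  */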
37314 /* Output code to perform a Newton-Raphson approximation of a
37315 single precision floating point [reciprocal] square root. */
37317 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37318 bool recip)
37320 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37321 REAL_VALUE_TYPE r;
37323 x0 = gen_reg_rtx (mode);
37324 e0 = gen_reg_rtx (mode);
37325 e1 = gen_reg_rtx (mode);
37326 e2 = gen_reg_rtx (mode);
37327 e3 = gen_reg_rtx (mode);
37329 real_from_integer (&r, VOIDmode, -3, -1, 0);
37330 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37332 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37333 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37335 if (VECTOR_MODE_P (mode))
37337 mthree = ix86_build_const_vector (mode, true, mthree);
37338 mhalf = ix86_build_const_vector (mode, true, mhalf);
37341 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37342 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37344 a = force_reg (mode, a);
37346 /* x0 = rsqrt(a) estimate */
37347 emit_insn (gen_rtx_SET (VOIDmode, x0,
37348 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37349 UNSPEC_RSQRT)));
37351 /* If a == 0.0, zero the infinite rsqrt estimate so that sqrt (0.0) gives 0.0 rather than a NaN. */
37352 if (!recip)
37354 rtx zero, mask;
37356 zero = gen_reg_rtx (mode);
37357 mask = gen_reg_rtx (mode);
37359 zero = force_reg (mode, CONST0_RTX(mode));
37360 emit_insn (gen_rtx_SET (VOIDmode, mask,
37361 gen_rtx_NE (mode, zero, a)));
37363 emit_insn (gen_rtx_SET (VOIDmode, x0,
37364 gen_rtx_AND (mode, x0, mask)));
37367 /* e0 = x0 * a */
37368 emit_insn (gen_rtx_SET (VOIDmode, e0,
37369 gen_rtx_MULT (mode, x0, a)));
37370 /* e1 = e0 * x0 */
37371 emit_insn (gen_rtx_SET (VOIDmode, e1,
37372 gen_rtx_MULT (mode, e0, x0)));
37374 /* e2 = e1 - 3. */
37375 mthree = force_reg (mode, mthree);
37376 emit_insn (gen_rtx_SET (VOIDmode, e2,
37377 gen_rtx_PLUS (mode, e1, mthree)));
37379 mhalf = force_reg (mode, mhalf);
37380 if (recip)
37381 /* e3 = -.5 * x0 */
37382 emit_insn (gen_rtx_SET (VOIDmode, e3,
37383 gen_rtx_MULT (mode, x0, mhalf)));
37384 else
37385 /* e3 = -.5 * e0 */
37386 emit_insn (gen_rtx_SET (VOIDmode, e3,
37387 gen_rtx_MULT (mode, e0, mhalf)));
37388 /* ret = e2 * e3 */
37389 emit_insn (gen_rtx_SET (VOIDmode, res,
37390 gen_rtx_MULT (mode, e2, e3)));
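/* Likewise, the sequence above is one Newton-Raphson step for the
   reciprocal square root: with the estimate x0 ~ 1/sqrt(a) from rsqrt,
     x1 = x0 * (1.5 - 0.5 * a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3)
   is computed as e2 * e3, with an extra multiply by a (via e0) folded in
   when the non-reciprocal square root is wanted.  */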
37393 #ifdef TARGET_SOLARIS
37394 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37396 static void
37397 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37398 tree decl)
37400 /* With Binutils 2.15, the "@unwind" marker must be specified on
37401 every occurrence of the ".eh_frame" section, not just the first
37402 one. */
37403 if (TARGET_64BIT
37404 && strcmp (name, ".eh_frame") == 0)
37406 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37407 flags & SECTION_WRITE ? "aw" : "a");
37408 return;
37411 #ifndef USE_GAS
37412 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37414 solaris_elf_asm_comdat_section (name, flags, decl);
37415 return;
37417 #endif
37419 default_elf_asm_named_section (name, flags, decl);
37421 #endif /* TARGET_SOLARIS */
37423 /* Return the mangling of TYPE if it is an extended fundamental type. */
37425 static const char *
37426 ix86_mangle_type (const_tree type)
37428 type = TYPE_MAIN_VARIANT (type);
37430 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37431 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37432 return NULL;
37434 switch (TYPE_MODE (type))
37436 case TFmode:
37437 /* __float128 is "g". */
37438 return "g";
37439 case XFmode:
37440 /* "long double" or __float80 is "e". */
37441 return "e";
37442 default:
37443 return NULL;
37447 /* For 32-bit code we can save PIC register setup by using the
37448 __stack_chk_fail_local hidden function instead of calling
37449 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
37450 register, so it is better to call __stack_chk_fail directly. */
37452 static tree ATTRIBUTE_UNUSED
37453 ix86_stack_protect_fail (void)
37455 return TARGET_64BIT
37456 ? default_external_stack_protect_fail ()
37457 : default_hidden_stack_protect_fail ();
37460 /* Select a format to encode pointers in exception handling data. CODE
37461 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37462 true if the symbol may be affected by dynamic relocations.
37464 ??? All x86 object file formats are capable of representing this.
37465 After all, the relocation needed is the same as for the call insn.
37466 Whether or not a particular assembler allows us to enter such, I
37467 guess we'll have to see. */
37468 int
37469 asm_preferred_eh_data_format (int code, int global)
37471 if (flag_pic)
37473 int type = DW_EH_PE_sdata8;
37474 if (!TARGET_64BIT
37475 || ix86_cmodel == CM_SMALL_PIC
37476 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37477 type = DW_EH_PE_sdata4;
37478 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37480 if (ix86_cmodel == CM_SMALL
37481 || (ix86_cmodel == CM_MEDIUM && code))
37482 return DW_EH_PE_udata4;
37483 return DW_EH_PE_absptr;
37486 /* Expand copysign: copy the sign of SIGN onto the non-negative value
37487 ABS_VALUE and store the result in RESULT. If MASK is non-null, it is
37488 the mask used to clear the sign bit. */
37489 static void
37490 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37492 enum machine_mode mode = GET_MODE (sign);
37493 rtx sgn = gen_reg_rtx (mode);
37494 if (mask == NULL_RTX)
37496 enum machine_mode vmode;
37498 if (mode == SFmode)
37499 vmode = V4SFmode;
37500 else if (mode == DFmode)
37501 vmode = V2DFmode;
37502 else
37503 vmode = mode;
37505 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37506 if (!VECTOR_MODE_P (mode))
37508 /* We need to generate a scalar mode mask in this case. */
37509 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37510 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37511 mask = gen_reg_rtx (mode);
37512 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37515 else
37516 mask = gen_rtx_NOT (mode, mask);
37517 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37518 gen_rtx_AND (mode, mask, sign)));
37519 emit_insn (gen_rtx_SET (VOIDmode, result,
37520 gen_rtx_IOR (mode, abs_value, sgn)));
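/* In other words (illustrative): RESULT = ABS_VALUE | (SIGN & sign-bit mask),
   assuming ABS_VALUE already has its sign bit clear, e.g. because it came
   from ix86_expand_sse_fabs.  */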
37523 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37524 mask for masking out the sign-bit is stored in *SMASK, if that is
37525 non-null. */
37526 static rtx
37527 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37529 enum machine_mode vmode, mode = GET_MODE (op0);
37530 rtx xa, mask;
37532 xa = gen_reg_rtx (mode);
37533 if (mode == SFmode)
37534 vmode = V4SFmode;
37535 else if (mode == DFmode)
37536 vmode = V2DFmode;
37537 else
37538 vmode = mode;
37539 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37540 if (!VECTOR_MODE_P (mode))
37542 /* We need to generate a scalar mode mask in this case. */
37543 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37544 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37545 mask = gen_reg_rtx (mode);
37546 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37548 emit_insn (gen_rtx_SET (VOIDmode, xa,
37549 gen_rtx_AND (mode, op0, mask)));
37551 if (smask)
37552 *smask = mask;
37554 return xa;
37557 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37558 swapping the operands if SWAP_OPERANDS is true. The expanded
37559 code is a forward jump to a newly created label in case the
37560 comparison is true. The generated label rtx is returned. */
37561 static rtx
37562 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37563 bool swap_operands)
37565 rtx label, tmp;
37567 if (swap_operands)
37569 tmp = op0;
37570 op0 = op1;
37571 op1 = tmp;
37574 label = gen_label_rtx ();
37575 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37576 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37577 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37578 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37579 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37580 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37581 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37582 JUMP_LABEL (tmp) = label;
37584 return label;
37587 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37588 using comparison code CODE. Operands are swapped for the comparison if
37589 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37590 static rtx
37591 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37592 bool swap_operands)
37594 rtx (*insn)(rtx, rtx, rtx, rtx);
37595 enum machine_mode mode = GET_MODE (op0);
37596 rtx mask = gen_reg_rtx (mode);
37598 if (swap_operands)
37600 rtx tmp = op0;
37601 op0 = op1;
37602 op1 = tmp;
37605 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37607 emit_insn (insn (mask, op0, op1,
37608 gen_rtx_fmt_ee (code, mode, op0, op1)));
37609 return mask;
37612 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37613 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37614 static rtx
37615 ix86_gen_TWO52 (enum machine_mode mode)
37617 REAL_VALUE_TYPE TWO52r;
37618 rtx TWO52;
37620 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37621 TWO52 = const_double_from_real_value (TWO52r, mode);
37622 TWO52 = force_reg (mode, TWO52);
37624 return TWO52;
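/* The callers below rely on the classic power-of-two trick: for DFmode,
   any |x| < 2**52 satisfies (x + 2**52) - 2**52 == x rounded to an integer
   in the current rounding mode, because doubles at magnitude 2**52 have no
   fractional bits.  For SFmode the same holds with 2**23.  */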
37627 /* Expand SSE sequence for computing lround from OP1 storing
37628 into OP0. */
37629 void
37630 ix86_expand_lround (rtx op0, rtx op1)
37632 /* C code for the stuff we're doing below:
37633 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37634 return (long)tmp;
37636 enum machine_mode mode = GET_MODE (op1);
37637 const struct real_format *fmt;
37638 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37639 rtx adj;
37641 /* load nextafter (0.5, 0.0) */
37642 fmt = REAL_MODE_FORMAT (mode);
37643 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37644 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37646 /* adj = copysign (0.5, op1) */
37647 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37648 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37650 /* adj = op1 + adj */
37651 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37653 /* op0 = (imode)adj */
37654 expand_fix (op0, adj, 0);
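/* Why nextafter (0.5, 0.0) rather than 0.5 (illustrative): with a plain
   0.5 addend, an input just below a halfway point can be pushed over it by
   the addition itself; e.g. 0.49999999999999994 + 0.5 rounds
   (ties-to-even) to exactly 1.0, so lround would return 1 instead of 0.
   Using the largest value strictly below 0.5 avoids that spurious carry.  */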
37657 /* Expand SSE2 sequence for computing lfloor or lceil (as selected by
37658 DO_FLOOR) from OP1, storing into OP0. */
37659 void
37660 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37662 /* C code for the stuff we're doing below (for do_floor):
37663 xi = (long)op1;
37664 xi -= (double)xi > op1 ? 1 : 0;
37665 return xi;
37667 enum machine_mode fmode = GET_MODE (op1);
37668 enum machine_mode imode = GET_MODE (op0);
37669 rtx ireg, freg, label, tmp;
37671 /* reg = (long)op1 */
37672 ireg = gen_reg_rtx (imode);
37673 expand_fix (ireg, op1, 0);
37675 /* freg = (double)reg */
37676 freg = gen_reg_rtx (fmode);
37677 expand_float (freg, ireg, 0);
37679 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37680 label = ix86_expand_sse_compare_and_jump (UNLE,
37681 freg, op1, !do_floor);
37682 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37683 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37684 emit_move_insn (ireg, tmp);
37686 emit_label (label);
37687 LABEL_NUSES (label) = 1;
37689 emit_move_insn (op0, ireg);
37692 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37693 result in OPERAND0. */
37694 void
37695 ix86_expand_rint (rtx operand0, rtx operand1)
37697 /* C code for the stuff we're doing below:
37698 xa = fabs (operand1);
37699 if (!isless (xa, 2**52))
37700 return operand1;
37701 xa = xa + 2**52 - 2**52;
37702 return copysign (xa, operand1);
37704 enum machine_mode mode = GET_MODE (operand0);
37705 rtx res, xa, label, TWO52, mask;
37707 res = gen_reg_rtx (mode);
37708 emit_move_insn (res, operand1);
37710 /* xa = abs (operand1) */
37711 xa = ix86_expand_sse_fabs (res, &mask);
37713 /* if (!isless (xa, TWO52)) goto label; */
37714 TWO52 = ix86_gen_TWO52 (mode);
37715 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37717 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37718 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37720 ix86_sse_copysign_to_positive (res, xa, res, mask);
37722 emit_label (label);
37723 LABEL_NUSES (label) = 1;
37725 emit_move_insn (operand0, res);
37728 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37729 into OPERAND0. */
37730 void
37731 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37733 /* C code for the stuff we expand below.
37734 double xa = fabs (x), x2;
37735 if (!isless (xa, TWO52))
37736 return x;
37737 xa = xa + TWO52 - TWO52;
37738 x2 = copysign (xa, x);
37739 Compensate. Floor:
37740 if (x2 > x)
37741 x2 -= 1;
37742 Compensate. Ceil:
37743 if (x2 < x)
37744 x2 -= -1;
37745 return x2;
37747 enum machine_mode mode = GET_MODE (operand0);
37748 rtx xa, TWO52, tmp, label, one, res, mask;
37750 TWO52 = ix86_gen_TWO52 (mode);
37752 /* Temporary for holding the result, initialized to the input
37753 operand to ease control flow. */
37754 res = gen_reg_rtx (mode);
37755 emit_move_insn (res, operand1);
37757 /* xa = abs (operand1) */
37758 xa = ix86_expand_sse_fabs (res, &mask);
37760 /* if (!isless (xa, TWO52)) goto label; */
37761 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37763 /* xa = xa + TWO52 - TWO52; */
37764 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37765 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37767 /* xa = copysign (xa, operand1) */
37768 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37770 /* generate 1.0 or -1.0 */
37771 one = force_reg (mode,
37772 const_double_from_real_value (do_floor
37773 ? dconst1 : dconstm1, mode));
37775 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37776 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37777 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37778 gen_rtx_AND (mode, one, tmp)));
37779 /* We always need to subtract here to preserve signed zero. */
37780 tmp = expand_simple_binop (mode, MINUS,
37781 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37782 emit_move_insn (res, tmp);
37784 emit_label (label);
37785 LABEL_NUSES (label) = 1;
37787 emit_move_insn (operand0, res);
37790 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37791 into OPERAND0. */
37792 void
37793 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37795 /* C code for the stuff we expand below.
37796 double xa = fabs (x), x2;
37797 if (!isless (xa, TWO52))
37798 return x;
37799 x2 = (double)(long)x;
37800 Compensate. Floor:
37801 if (x2 > x)
37802 x2 -= 1;
37803 Compensate. Ceil:
37804 if (x2 < x)
37805 x2 += 1;
37806 if (HONOR_SIGNED_ZEROS (mode))
37807 return copysign (x2, x);
37808 return x2;
37810 enum machine_mode mode = GET_MODE (operand0);
37811 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37813 TWO52 = ix86_gen_TWO52 (mode);
37815 /* Temporary for holding the result, initialized to the input
37816 operand to ease control flow. */
37817 res = gen_reg_rtx (mode);
37818 emit_move_insn (res, operand1);
37820 /* xa = abs (operand1) */
37821 xa = ix86_expand_sse_fabs (res, &mask);
37823 /* if (!isless (xa, TWO52)) goto label; */
37824 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37826 /* xa = (double)(long)x */
37827 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37828 expand_fix (xi, res, 0);
37829 expand_float (xa, xi, 0);
37831 /* generate 1.0 */
37832 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37834 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37835 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37836 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37837 gen_rtx_AND (mode, one, tmp)));
37838 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37839 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37840 emit_move_insn (res, tmp);
37842 if (HONOR_SIGNED_ZEROS (mode))
37843 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37845 emit_label (label);
37846 LABEL_NUSES (label) = 1;
37848 emit_move_insn (operand0, res);
37851 /* Expand SSE sequence for computing round from OPERAND1 storing
37852 into OPERAND0, using a sequence that works without relying on DImode
37853 truncation via cvttsd2siq, which is only available on 64-bit targets. */
37854 void
37855 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37857 /* C code for the stuff we expand below.
37858 double xa = fabs (x), xa2, x2;
37859 if (!isless (xa, TWO52))
37860 return x;
37861 Using the absolute value and copying back sign makes
37862 -0.0 -> -0.0 correct.
37863 xa2 = xa + TWO52 - TWO52;
37864 Compensate.
37865 dxa = xa2 - xa;
37866 if (dxa <= -0.5)
37867 xa2 += 1;
37868 else if (dxa > 0.5)
37869 xa2 -= 1;
37870 x2 = copysign (xa2, x);
37871 return x2;
37873 enum machine_mode mode = GET_MODE (operand0);
37874 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37876 TWO52 = ix86_gen_TWO52 (mode);
37878 /* Temporary for holding the result, initialized to the input
37879 operand to ease control flow. */
37880 res = gen_reg_rtx (mode);
37881 emit_move_insn (res, operand1);
37883 /* xa = abs (operand1) */
37884 xa = ix86_expand_sse_fabs (res, &mask);
37886 /* if (!isless (xa, TWO52)) goto label; */
37887 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37889 /* xa2 = xa + TWO52 - TWO52; */
37890 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37891 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37893 /* dxa = xa2 - xa; */
37894 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37896 /* generate 0.5, 1.0 and -0.5 */
37897 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37898 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37899 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37900 0, OPTAB_DIRECT);
37902 /* Compensate. */
37903 tmp = gen_reg_rtx (mode);
37904 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37905 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37906 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37907 gen_rtx_AND (mode, one, tmp)));
37908 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37909 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37910 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37911 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37912 gen_rtx_AND (mode, one, tmp)));
37913 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37915 /* res = copysign (xa2, operand1) */
37916 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37918 emit_label (label);
37919 LABEL_NUSES (label) = 1;
37921 emit_move_insn (operand0, res);
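/* Worked example of the compensation above (illustrative, round-to-nearest):
     x = 2.6:  xa2 = (2.6 + 2**52) - 2**52 = 3.0, dxa ~ 0.4, no adjustment -> 3.0
     x = 2.4:  xa2 = 2.0, dxa ~ -0.4, no adjustment -> 2.0
     x = 2.5:  xa2 = 2.0 (ties-to-even), dxa = -0.5 <= -0.5, add 1.0 -> 3.0
   which gives the round () semantics of halfway cases away from zero.  */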
37924 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37925 into OPERAND0. */
37926 void
37927 ix86_expand_trunc (rtx operand0, rtx operand1)
37929 /* C code for SSE variant we expand below.
37930 double xa = fabs (x), x2;
37931 if (!isless (xa, TWO52))
37932 return x;
37933 x2 = (double)(long)x;
37934 if (HONOR_SIGNED_ZEROS (mode))
37935 return copysign (x2, x);
37936 return x2;
37938 enum machine_mode mode = GET_MODE (operand0);
37939 rtx xa, xi, TWO52, label, res, mask;
37941 TWO52 = ix86_gen_TWO52 (mode);
37943 /* Temporary for holding the result, initialized to the input
37944 operand to ease control flow. */
37945 res = gen_reg_rtx (mode);
37946 emit_move_insn (res, operand1);
37948 /* xa = abs (operand1) */
37949 xa = ix86_expand_sse_fabs (res, &mask);
37951 /* if (!isless (xa, TWO52)) goto label; */
37952 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37954 /* x = (double)(long)x */
37955 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37956 expand_fix (xi, res, 0);
37957 expand_float (res, xi, 0);
37959 if (HONOR_SIGNED_ZEROS (mode))
37960 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37962 emit_label (label);
37963 LABEL_NUSES (label) = 1;
37965 emit_move_insn (operand0, res);
37968 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37969 into OPERAND0, using a sequence that does not rely on DImode truncation. */
37970 void
37971 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
37973 enum machine_mode mode = GET_MODE (operand0);
37974 rtx xa, mask, TWO52, label, one, res, smask, tmp;
37976 /* C code for SSE variant we expand below.
37977 double xa = fabs (x), x2;
37978 if (!isless (xa, TWO52))
37979 return x;
37980 xa2 = xa + TWO52 - TWO52;
37981 Compensate:
37982 if (xa2 > xa)
37983 xa2 -= 1.0;
37984 x2 = copysign (xa2, x);
37985 return x2;
37988 TWO52 = ix86_gen_TWO52 (mode);
37990 /* Temporary for holding the result, initialized to the input
37991 operand to ease control flow. */
37992 res = gen_reg_rtx (mode);
37993 emit_move_insn (res, operand1);
37995 /* xa = abs (operand1) */
37996 xa = ix86_expand_sse_fabs (res, &smask);
37998 /* if (!isless (xa, TWO52)) goto label; */
37999 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38001 /* res = xa + TWO52 - TWO52; */
38002 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38003 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38004 emit_move_insn (res, tmp);
38006 /* generate 1.0 */
38007 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38009 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38010 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38011 emit_insn (gen_rtx_SET (VOIDmode, mask,
38012 gen_rtx_AND (mode, mask, one)));
38013 tmp = expand_simple_binop (mode, MINUS,
38014 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38015 emit_move_insn (res, tmp);
38017 /* res = copysign (res, operand1) */
38018 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38020 emit_label (label);
38021 LABEL_NUSES (label) = 1;
38023 emit_move_insn (operand0, res);
38026 /* Expand SSE sequence for computing round from OPERAND1 storing
38027 into OPERAND0. */
38028 void
38029 ix86_expand_round (rtx operand0, rtx operand1)
38031 /* C code for the stuff we're doing below:
38032 double xa = fabs (x);
38033 if (!isless (xa, TWO52))
38034 return x;
38035 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38036 return copysign (xa, x);
38038 enum machine_mode mode = GET_MODE (operand0);
38039 rtx res, TWO52, xa, label, xi, half, mask;
38040 const struct real_format *fmt;
38041 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38043 /* Temporary for holding the result, initialized to the input
38044 operand to ease control flow. */
38045 res = gen_reg_rtx (mode);
38046 emit_move_insn (res, operand1);
38048 TWO52 = ix86_gen_TWO52 (mode);
38049 xa = ix86_expand_sse_fabs (res, &mask);
38050 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38052 /* load nextafter (0.5, 0.0) */
38053 fmt = REAL_MODE_FORMAT (mode);
38054 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38055 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38057 /* xa = xa + 0.5 */
38058 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38059 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38061 /* xa = (double)(int64_t)xa */
38062 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38063 expand_fix (xi, xa, 0);
38064 expand_float (xa, xi, 0);
38066 /* res = copysign (xa, operand1) */
38067 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38069 emit_label (label);
38070 LABEL_NUSES (label) = 1;
38072 emit_move_insn (operand0, res);
38075 /* Expand SSE sequence for computing round
38076 from OP1 storing into OP0 using sse4 round insn. */
38077 void
38078 ix86_expand_round_sse4 (rtx op0, rtx op1)
38080 enum machine_mode mode = GET_MODE (op0);
38081 rtx e1, e2, res, half;
38082 const struct real_format *fmt;
38083 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38084 rtx (*gen_copysign) (rtx, rtx, rtx);
38085 rtx (*gen_round) (rtx, rtx, rtx);
38087 switch (mode)
38089 case SFmode:
38090 gen_copysign = gen_copysignsf3;
38091 gen_round = gen_sse4_1_roundsf2;
38092 break;
38093 case DFmode:
38094 gen_copysign = gen_copysigndf3;
38095 gen_round = gen_sse4_1_rounddf2;
38096 break;
38097 default:
38098 gcc_unreachable ();
38101 /* round (a) = trunc (a + copysign (0.5, a)) */
38103 /* load nextafter (0.5, 0.0) */
38104 fmt = REAL_MODE_FORMAT (mode);
38105 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38106 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38107 half = const_double_from_real_value (pred_half, mode);
38109 /* e1 = copysign (0.5, op1) */
38110 e1 = gen_reg_rtx (mode);
38111 emit_insn (gen_copysign (e1, half, op1));
38113 /* e2 = op1 + e1 */
38114 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38116 /* res = trunc (e2) */
38117 res = gen_reg_rtx (mode);
38118 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38120 emit_move_insn (op0, res);
38124 /* Table of valid machine attributes. */
38125 static const struct attribute_spec ix86_attribute_table[] =
38127 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38128 affects_type_identity } */
38129 /* Stdcall attribute says callee is responsible for popping arguments
38130 if they are not variable. */
38131 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38132 true },
38133 /* Fastcall attribute says callee is responsible for popping arguments
38134 if they are not variable. */
38135 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38136 true },
38137 /* Thiscall attribute says callee is responsible for popping arguments
38138 if they are not variable. */
38139 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38140 true },
38141 /* Cdecl attribute says the callee is a normal C declaration */
38142 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38143 true },
38144 /* Regparm attribute specifies how many integer arguments are to be
38145 passed in registers. */
38146 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38147 true },
38148 /* Sseregparm attribute says we are using x86_64 calling conventions
38149 for FP arguments. */
38150 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38151 true },
38152 /* The transactional memory builtins are implicitly regparm or fastcall
38153 depending on the ABI. Override the generic do-nothing attribute that
38154 these builtins were declared with. */
38155 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38156 true },
38157 /* force_align_arg_pointer says this function realigns the stack at entry. */
38158 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38159 false, true, true, ix86_handle_cconv_attribute, false },
38160 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38161 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38162 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38163 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38164 false },
38165 #endif
38166 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38167 false },
38168 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38169 false },
38170 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38171 SUBTARGET_ATTRIBUTE_TABLE,
38172 #endif
38173 /* ms_abi and sysv_abi calling convention function attributes. */
38174 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38175 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38176 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38177 false },
38178 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38179 ix86_handle_callee_pop_aggregate_return, true },
38180 /* End element. */
38181 { NULL, 0, 0, false, false, false, NULL, false }
38184 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38185 static int
38186 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38187 tree vectype,
38188 int misalign ATTRIBUTE_UNUSED)
38190 unsigned elements;
38192 switch (type_of_cost)
38194 case scalar_stmt:
38195 return ix86_cost->scalar_stmt_cost;
38197 case scalar_load:
38198 return ix86_cost->scalar_load_cost;
38200 case scalar_store:
38201 return ix86_cost->scalar_store_cost;
38203 case vector_stmt:
38204 return ix86_cost->vec_stmt_cost;
38206 case vector_load:
38207 return ix86_cost->vec_align_load_cost;
38209 case vector_store:
38210 return ix86_cost->vec_store_cost;
38212 case vec_to_scalar:
38213 return ix86_cost->vec_to_scalar_cost;
38215 case scalar_to_vec:
38216 return ix86_cost->scalar_to_vec_cost;
38218 case unaligned_load:
38219 case unaligned_store:
38220 return ix86_cost->vec_unalign_load_cost;
38222 case cond_branch_taken:
38223 return ix86_cost->cond_taken_branch_cost;
38225 case cond_branch_not_taken:
38226 return ix86_cost->cond_not_taken_branch_cost;
38228 case vec_perm:
38229 case vec_promote_demote:
38230 return ix86_cost->vec_stmt_cost;
38232 case vec_construct:
38233 elements = TYPE_VECTOR_SUBPARTS (vectype);
38234 return elements / 2 + 1;
38236 default:
38237 gcc_unreachable ();
38241 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38242 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38243 insn every time. */
38245 static GTY(()) rtx vselect_insn;
38247 /* Initialize vselect_insn. */
38249 static void
38250 init_vselect_insn (void)
38252 unsigned i;
38253 rtx x;
38255 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38256 for (i = 0; i < MAX_VECT_LEN; ++i)
38257 XVECEXP (x, 0, i) = const0_rtx;
38258 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38259 const0_rtx), x);
38260 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38261 start_sequence ();
38262 vselect_insn = emit_insn (x);
38263 end_sequence ();
38266 /* Construct (set target (vec_select op0 (parallel perm))) and
38267 return true if that's a valid instruction in the active ISA. */
38269 static bool
38270 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38271 unsigned nelt, bool testing_p)
38273 unsigned int i;
38274 rtx x, save_vconcat;
38275 int icode;
38277 if (vselect_insn == NULL_RTX)
38278 init_vselect_insn ();
38280 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38281 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38282 for (i = 0; i < nelt; ++i)
38283 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38284 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38285 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38286 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38287 SET_DEST (PATTERN (vselect_insn)) = target;
38288 icode = recog_memoized (vselect_insn);
38290 if (icode >= 0 && !testing_p)
38291 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38293 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38294 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38295 INSN_CODE (vselect_insn) = -1;
38297 return icode >= 0;
38300 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38302 static bool
38303 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38304 const unsigned char *perm, unsigned nelt,
38305 bool testing_p)
38307 enum machine_mode v2mode;
38308 rtx x;
38309 bool ok;
38311 if (vselect_insn == NULL_RTX)
38312 init_vselect_insn ();
38314 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38315 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38316 PUT_MODE (x, v2mode);
38317 XEXP (x, 0) = op0;
38318 XEXP (x, 1) = op1;
38319 ok = expand_vselect (target, x, perm, nelt, testing_p);
38320 XEXP (x, 0) = const0_rtx;
38321 XEXP (x, 1) = const0_rtx;
38322 return ok;
38325 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38326 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38328 static bool
38329 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38331 enum machine_mode vmode = d->vmode;
38332 unsigned i, mask, nelt = d->nelt;
38333 rtx target, op0, op1, x;
38334 rtx rperm[32], vperm;
38336 if (d->one_operand_p)
38337 return false;
38338 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38340 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38342 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38344 else
38345 return false;
38347 /* This is a blend, not a permute. Elements must stay in their
38348 respective lanes. */
38349 for (i = 0; i < nelt; ++i)
38351 unsigned e = d->perm[i];
38352 if (!(e == i || e == i + nelt))
38353 return false;
38356 if (d->testing_p)
38357 return true;
38359 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38360 decision should be extracted elsewhere, so that we only try that
38361 sequence once all budget==3 options have been tried. */
38362 target = d->target;
38363 op0 = d->op0;
38364 op1 = d->op1;
38365 mask = 0;
38367 switch (vmode)
38369 case V4DFmode:
38370 case V8SFmode:
38371 case V2DFmode:
38372 case V4SFmode:
38373 case V8HImode:
38374 case V8SImode:
38375 for (i = 0; i < nelt; ++i)
38376 mask |= (d->perm[i] >= nelt) << i;
38377 break;
38379 case V2DImode:
38380 for (i = 0; i < 2; ++i)
38381 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38382 vmode = V8HImode;
38383 goto do_subreg;
38385 case V4SImode:
38386 for (i = 0; i < 4; ++i)
38387 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38388 vmode = V8HImode;
38389 goto do_subreg;
38391 case V16QImode:
38392 /* See if bytes move in pairs so we can use pblendw with
38393 an immediate argument, rather than pblendvb with a vector
38394 argument. */
38395 for (i = 0; i < 16; i += 2)
38396 if (d->perm[i] + 1 != d->perm[i + 1])
38398 use_pblendvb:
38399 for (i = 0; i < nelt; ++i)
38400 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38402 finish_pblendvb:
38403 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38404 vperm = force_reg (vmode, vperm);
38406 if (GET_MODE_SIZE (vmode) == 16)
38407 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38408 else
38409 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38410 return true;
38413 for (i = 0; i < 8; ++i)
38414 mask |= (d->perm[i * 2] >= 16) << i;
38415 vmode = V8HImode;
38416 /* FALLTHRU */
38418 do_subreg:
38419 target = gen_lowpart (vmode, target);
38420 op0 = gen_lowpart (vmode, op0);
38421 op1 = gen_lowpart (vmode, op1);
38422 break;
38424 case V32QImode:
38425 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38426 for (i = 0; i < 32; i += 2)
38427 if (d->perm[i] + 1 != d->perm[i + 1])
38428 goto use_pblendvb;
38429 /* See if bytes move in quadruplets. If yes, vpblendd
38430 with immediate can be used. */
38431 for (i = 0; i < 32; i += 4)
38432 if (d->perm[i] + 2 != d->perm[i + 2])
38433 break;
38434 if (i < 32)
38436 /* See if bytes move the same in both lanes. If yes,
38437 vpblendw with immediate can be used. */
38438 for (i = 0; i < 16; i += 2)
38439 if (d->perm[i] + 16 != d->perm[i + 16])
38440 goto use_pblendvb;
38442 /* Use vpblendw. */
38443 for (i = 0; i < 16; ++i)
38444 mask |= (d->perm[i * 2] >= 32) << i;
38445 vmode = V16HImode;
38446 goto do_subreg;
38449 /* Use vpblendd. */
38450 for (i = 0; i < 8; ++i)
38451 mask |= (d->perm[i * 4] >= 32) << i;
38452 vmode = V8SImode;
38453 goto do_subreg;
38455 case V16HImode:
38456 /* See if words move in pairs. If yes, vpblendd can be used. */
38457 for (i = 0; i < 16; i += 2)
38458 if (d->perm[i] + 1 != d->perm[i + 1])
38459 break;
38460 if (i < 16)
38462 /* See if words move the same in both lanes. If not,
38463 vpblendvb must be used. */
38464 for (i = 0; i < 8; i++)
38465 if (d->perm[i] + 8 != d->perm[i + 8])
38467 /* Use vpblendvb. */
38468 for (i = 0; i < 32; ++i)
38469 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38471 vmode = V32QImode;
38472 nelt = 32;
38473 target = gen_lowpart (vmode, target);
38474 op0 = gen_lowpart (vmode, op0);
38475 op1 = gen_lowpart (vmode, op1);
38476 goto finish_pblendvb;
38479 /* Use vpblendw. */
38480 for (i = 0; i < 16; ++i)
38481 mask |= (d->perm[i] >= 16) << i;
38482 break;
38485 /* Use vpblendd. */
38486 for (i = 0; i < 8; ++i)
38487 mask |= (d->perm[i * 2] >= 16) << i;
38488 vmode = V8SImode;
38489 goto do_subreg;
38491 case V4DImode:
38492 /* Use vpblendd. */
38493 for (i = 0; i < 4; ++i)
38494 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38495 vmode = V8SImode;
38496 goto do_subreg;
38498 default:
38499 gcc_unreachable ();
38502 /* This matches five different patterns with the different modes. */
38503 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38504 x = gen_rtx_SET (VOIDmode, target, x);
38505 emit_insn (x);
38507 return true;
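/* Worked example (illustrative): for a V4SFmode permutation {0, 5, 2, 7}
   each element with index >= nelt (== 4) selects op1, so the loop above
   builds mask = 0b1010 and the final VEC_MERGE becomes a blendps with
   immediate 10, taking elements 1 and 3 from op1 and the rest from op0.  */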
38510 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38511 in terms of the variable form of vpermilps.
38513 Note that we will have already failed the immediate input vpermilps,
38514 which requires that the high and low part shuffle be identical; the
38515 variable form doesn't require that. */
38517 static bool
38518 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38520 rtx rperm[8], vperm;
38521 unsigned i;
38523 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38524 return false;
38526 /* We can only permute within each 128-bit lane. */
38527 for (i = 0; i < 8; ++i)
38529 unsigned e = d->perm[i];
38530 if (i < 4 ? e >= 4 : e < 4)
38531 return false;
38534 if (d->testing_p)
38535 return true;
38537 for (i = 0; i < 8; ++i)
38539 unsigned e = d->perm[i];
38541 /* Within each 128-bit lane, the elements of op0 are numbered
38542 from 0 and the elements of op1 are numbered from 4. */
38543 if (e >= 8 + 4)
38544 e -= 8;
38545 else if (e >= 4)
38546 e -= 4;
38548 rperm[i] = GEN_INT (e);
38551 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38552 vperm = force_reg (V8SImode, vperm);
38553 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38555 return true;
38558 /* Return true if permutation D can be performed as VMODE permutation
38559 instead. */
38561 static bool
38562 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38564 unsigned int i, j, chunk;
38566 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38567 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38568 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38569 return false;
38571 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38572 return true;
38574 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38575 for (i = 0; i < d->nelt; i += chunk)
38576 if (d->perm[i] & (chunk - 1))
38577 return false;
38578 else
38579 for (j = 1; j < chunk; ++j)
38580 if (d->perm[i] + j != d->perm[i + j])
38581 return false;
38583 return true;
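/* Example (illustrative): the V16QImode permutation
     { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }
   moves bytes in aligned groups of four, so it passes the checks above and
   can equally be carried out as the V4SImode permutation {1, 0, 3, 2}.  */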
38586 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38587 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38589 static bool
38590 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38592 unsigned i, nelt, eltsz, mask;
38593 unsigned char perm[32];
38594 enum machine_mode vmode = V16QImode;
38595 rtx rperm[32], vperm, target, op0, op1;
38597 nelt = d->nelt;
38599 if (!d->one_operand_p)
38601 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38603 if (TARGET_AVX2
38604 && valid_perm_using_mode_p (V2TImode, d))
38606 if (d->testing_p)
38607 return true;
38609 /* Use vperm2i128 insn. The pattern uses
38610 V4DImode instead of V2TImode. */
38611 target = gen_lowpart (V4DImode, d->target);
38612 op0 = gen_lowpart (V4DImode, d->op0);
38613 op1 = gen_lowpart (V4DImode, d->op1);
38614 rperm[0]
38615 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
38616 | ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
38617 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38618 return true;
38620 return false;
38623 else
38625 if (GET_MODE_SIZE (d->vmode) == 16)
38627 if (!TARGET_SSSE3)
38628 return false;
38630 else if (GET_MODE_SIZE (d->vmode) == 32)
38632 if (!TARGET_AVX2)
38633 return false;
38635 /* V4DImode should already have been handled through
38636 expand_vselect by the vpermq instruction. */
38637 gcc_assert (d->vmode != V4DImode);
38639 vmode = V32QImode;
38640 if (d->vmode == V8SImode
38641 || d->vmode == V16HImode
38642 || d->vmode == V32QImode)
38644 /* First see if vpermq can be used for
38645 V8SImode/V16HImode/V32QImode. */
38646 if (valid_perm_using_mode_p (V4DImode, d))
38648 for (i = 0; i < 4; i++)
38649 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38650 if (d->testing_p)
38651 return true;
38652 return expand_vselect (gen_lowpart (V4DImode, d->target),
38653 gen_lowpart (V4DImode, d->op0),
38654 perm, 4, false);
38657 /* Next see if vpermd can be used. */
38658 if (valid_perm_using_mode_p (V8SImode, d))
38659 vmode = V8SImode;
38661 /* Or if vpermps can be used. */
38662 else if (d->vmode == V8SFmode)
38663 vmode = V8SImode;
38665 if (vmode == V32QImode)
38667 /* vpshufb only works within 128-bit lanes; it is not
38668 possible to shuffle bytes across lanes. */
38669 for (i = 0; i < nelt; ++i)
38670 if ((d->perm[i] ^ i) & (nelt / 2))
38671 return false;
38674 else
38675 return false;
38678 if (d->testing_p)
38679 return true;
38681 if (vmode == V8SImode)
38682 for (i = 0; i < 8; ++i)
38683 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38684 else
38686 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38687 if (!d->one_operand_p)
38688 mask = 2 * nelt - 1;
38689 else if (vmode == V16QImode)
38690 mask = nelt - 1;
38691 else
38692 mask = nelt / 2 - 1;
38694 for (i = 0; i < nelt; ++i)
38696 unsigned j, e = d->perm[i] & mask;
38697 for (j = 0; j < eltsz; ++j)
38698 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38702 vperm = gen_rtx_CONST_VECTOR (vmode,
38703 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38704 vperm = force_reg (vmode, vperm);
38706 target = gen_lowpart (vmode, d->target);
38707 op0 = gen_lowpart (vmode, d->op0);
38708 if (d->one_operand_p)
38710 if (vmode == V16QImode)
38711 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38712 else if (vmode == V32QImode)
38713 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38714 else if (vmode == V8SFmode)
38715 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38716 else
38717 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38719 else
38721 op1 = gen_lowpart (vmode, d->op1);
38722 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38725 return true;
38728 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
38729 in a single instruction. */
38731 static bool
38732 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38734 unsigned i, nelt = d->nelt;
38735 unsigned char perm2[MAX_VECT_LEN];
38737 /* Check plain VEC_SELECT first, because AVX has instructions that could
38738 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38739 input where SEL+CONCAT may not. */
38740 if (d->one_operand_p)
38742 int mask = nelt - 1;
38743 bool identity_perm = true;
38744 bool broadcast_perm = true;
38746 for (i = 0; i < nelt; i++)
38748 perm2[i] = d->perm[i] & mask;
38749 if (perm2[i] != i)
38750 identity_perm = false;
38751 if (perm2[i])
38752 broadcast_perm = false;
38755 if (identity_perm)
38757 if (!d->testing_p)
38758 emit_move_insn (d->target, d->op0);
38759 return true;
38761 else if (broadcast_perm && TARGET_AVX2)
38763 /* Use vpbroadcast{b,w,d}. */
38764 rtx (*gen) (rtx, rtx) = NULL;
38765 switch (d->vmode)
38767 case V32QImode:
38768 gen = gen_avx2_pbroadcastv32qi_1;
38769 break;
38770 case V16HImode:
38771 gen = gen_avx2_pbroadcastv16hi_1;
38772 break;
38773 case V8SImode:
38774 gen = gen_avx2_pbroadcastv8si_1;
38775 break;
38776 case V16QImode:
38777 gen = gen_avx2_pbroadcastv16qi;
38778 break;
38779 case V8HImode:
38780 gen = gen_avx2_pbroadcastv8hi;
38781 break;
38782 case V8SFmode:
38783 gen = gen_avx2_vec_dupv8sf_1;
38784 break;
38785 /* For other modes prefer other shuffles this function creates. */
38786 default: break;
38788 if (gen != NULL)
38790 if (!d->testing_p)
38791 emit_insn (gen (d->target, d->op0));
38792 return true;
38796 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38797 return true;
38799 /* There are plenty of patterns in sse.md that are written for
38800 SEL+CONCAT and are not replicated for a single op. Perhaps
38801 that should be changed, to avoid the nastiness here. */
38803 /* Recognize interleave style patterns, which means incrementing
38804 every other permutation operand. */
38805 for (i = 0; i < nelt; i += 2)
38807 perm2[i] = d->perm[i] & mask;
38808 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38810 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38811 d->testing_p))
38812 return true;
38814 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38815 if (nelt >= 4)
38817 for (i = 0; i < nelt; i += 4)
38819 perm2[i + 0] = d->perm[i + 0] & mask;
38820 perm2[i + 1] = d->perm[i + 1] & mask;
38821 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38822 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38825 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38826 d->testing_p))
38827 return true;
38831 /* Finally, try the fully general two operand permute. */
38832 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38833 d->testing_p))
38834 return true;
38836 /* Recognize interleave style patterns with reversed operands. */
38837 if (!d->one_operand_p)
38839 for (i = 0; i < nelt; ++i)
38841 unsigned e = d->perm[i];
38842 if (e >= nelt)
38843 e -= nelt;
38844 else
38845 e += nelt;
38846 perm2[i] = e;
38849 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38850 d->testing_p))
38851 return true;
38854 /* Try the SSE4.1 blend variable merge instructions. */
38855 if (expand_vec_perm_blend (d))
38856 return true;
38858 /* Try one of the AVX vpermil variable permutations. */
38859 if (expand_vec_perm_vpermil (d))
38860 return true;
38862 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38863 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38864 if (expand_vec_perm_pshufb (d))
38865 return true;
38867 return false;
38870 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
38871 in terms of a pair of pshuflw + pshufhw instructions. */
38873 static bool
38874 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38876 unsigned char perm2[MAX_VECT_LEN];
38877 unsigned i;
38878 bool ok;
38880 if (d->vmode != V8HImode || !d->one_operand_p)
38881 return false;
38883 /* The two permutations only operate in 64-bit lanes. */
38884 for (i = 0; i < 4; ++i)
38885 if (d->perm[i] >= 4)
38886 return false;
38887 for (i = 4; i < 8; ++i)
38888 if (d->perm[i] < 4)
38889 return false;
38891 if (d->testing_p)
38892 return true;
38894 /* Emit the pshuflw. */
38895 memcpy (perm2, d->perm, 4);
38896 for (i = 4; i < 8; ++i)
38897 perm2[i] = i;
38898 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38899 gcc_assert (ok);
38901 /* Emit the pshufhw. */
38902 memcpy (perm2 + 4, d->perm + 4, 4);
38903 for (i = 0; i < 4; ++i)
38904 perm2[i] = i;
38905 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38906 gcc_assert (ok);
38908 return true;
38911 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
38912 the permutation using the SSSE3 palignr instruction. This succeeds
38913 when all of the elements in PERM fit within one vector and we merely
38914 need to shift them down so that a single vector permutation has a
38915 chance to succeed. */
38917 static bool
38918 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38920 unsigned i, nelt = d->nelt;
38921 unsigned min, max;
38922 bool in_order, ok;
38923 rtx shift;
38925 /* Even with AVX, palignr only operates on 128-bit vectors. */
38926 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38927 return false;
38929 min = nelt, max = 0;
38930 for (i = 0; i < nelt; ++i)
38932 unsigned e = d->perm[i];
38933 if (e < min)
38934 min = e;
38935 if (e > max)
38936 max = e;
38938 if (min == 0 || max - min >= nelt)
38939 return false;
38941 /* Given that we have SSSE3, we know we'll be able to implement the
38942 single operand permutation after the palignr with pshufb. */
38943 if (d->testing_p)
38944 return true;
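/* Illustrative example: for a V8HImode selector { 3 4 5 6 7 8 9 10 },
min == 3, so the op1:op0 pair is shifted right by 3 elements (48 bits);
the residual permutation then becomes the identity and the palignr alone
produces the desired result. */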
38946 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
38947 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
38948 gen_lowpart (TImode, d->op1),
38949 gen_lowpart (TImode, d->op0), shift));
38951 d->op0 = d->op1 = d->target;
38952 d->one_operand_p = true;
38954 in_order = true;
38955 for (i = 0; i < nelt; ++i)
38957 unsigned e = d->perm[i] - min;
38958 if (e != i)
38959 in_order = false;
38960 d->perm[i] = e;
38963 /* Test for the degenerate case where the alignment by itself
38964 produces the desired permutation. */
38965 if (in_order)
38966 return true;
38968 ok = expand_vec_perm_1 (d);
38969 gcc_assert (ok);
38971 return ok;
38974 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
38976 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
38977 a two vector permutation into a single vector permutation by using
38978 an interleave operation to merge the vectors. */
38980 static bool
38981 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
38983 struct expand_vec_perm_d dremap, dfinal;
38984 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
38985 unsigned HOST_WIDE_INT contents;
38986 unsigned char remap[2 * MAX_VECT_LEN];
38987 rtx seq;
38988 bool ok, same_halves = false;
38990 if (GET_MODE_SIZE (d->vmode) == 16)
38992 if (d->one_operand_p)
38993 return false;
38995 else if (GET_MODE_SIZE (d->vmode) == 32)
38997 if (!TARGET_AVX)
38998 return false;
38999 /* For 32-byte modes allow even d->one_operand_p.
39000 The lack of cross-lane shuffling in some instructions
39001 might prevent a single insn shuffle. */
39002 dfinal = *d;
39003 dfinal.testing_p = true;
39004 /* If expand_vec_perm_interleave3 can expand this into
39005 a 3 insn sequence, give up and let it be expanded as
39006 a 3 insn sequence. While that is one insn longer,
39007 it doesn't need a memory operand, and in the common
39008 case where the interleave low and high permutations
39009 with the same operands are adjacent, the pair needs
39010 only 4 insns after CSE. */
39011 if (expand_vec_perm_interleave3 (&dfinal))
39012 return false;
39014 else
39015 return false;
39017 /* Examine from whence the elements come. */
39018 contents = 0;
39019 for (i = 0; i < nelt; ++i)
39020 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39022 memset (remap, 0xff, sizeof (remap));
39023 dremap = *d;
39025 if (GET_MODE_SIZE (d->vmode) == 16)
39027 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39029 /* Split the two input vectors into 4 halves. */
39030 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39031 h2 = h1 << nelt2;
39032 h3 = h2 << nelt2;
39033 h4 = h3 << nelt2;
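/* For example, with V8HImode (nelt2 == 4) the four masks are h1 == 0x000f,
h2 == 0x00f0, h3 == 0x0f00 and h4 == 0xf000, covering op0's low half, op0's
high half, op1's low half and op1's high half respectively. */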
39035 /* If the elements are all from the low halves, use interleave low, and
39036 similarly interleave high for the high halves. If the elements are from
39037 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39038 if ((contents & (h1 | h3)) == contents)
39040 /* punpckl* */
39041 for (i = 0; i < nelt2; ++i)
39043 remap[i] = i * 2;
39044 remap[i + nelt] = i * 2 + 1;
39045 dremap.perm[i * 2] = i;
39046 dremap.perm[i * 2 + 1] = i + nelt;
39048 if (!TARGET_SSE2 && d->vmode == V4SImode)
39049 dremap.vmode = V4SFmode;
39051 else if ((contents & (h2 | h4)) == contents)
39053 /* punpckh* */
39054 for (i = 0; i < nelt2; ++i)
39056 remap[i + nelt2] = i * 2;
39057 remap[i + nelt + nelt2] = i * 2 + 1;
39058 dremap.perm[i * 2] = i + nelt2;
39059 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39061 if (!TARGET_SSE2 && d->vmode == V4SImode)
39062 dremap.vmode = V4SFmode;
39064 else if ((contents & (h1 | h4)) == contents)
39066 /* shufps */
39067 for (i = 0; i < nelt2; ++i)
39069 remap[i] = i;
39070 remap[i + nelt + nelt2] = i + nelt2;
39071 dremap.perm[i] = i;
39072 dremap.perm[i + nelt2] = i + nelt + nelt2;
39074 if (nelt != 4)
39076 /* shufpd */
39077 dremap.vmode = V2DImode;
39078 dremap.nelt = 2;
39079 dremap.perm[0] = 0;
39080 dremap.perm[1] = 3;
39083 else if ((contents & (h2 | h3)) == contents)
39085 /* shufps */
39086 for (i = 0; i < nelt2; ++i)
39088 remap[i + nelt2] = i;
39089 remap[i + nelt] = i + nelt2;
39090 dremap.perm[i] = i + nelt2;
39091 dremap.perm[i + nelt2] = i + nelt;
39093 if (nelt != 4)
39095 /* shufpd */
39096 dremap.vmode = V2DImode;
39097 dremap.nelt = 2;
39098 dremap.perm[0] = 1;
39099 dremap.perm[1] = 2;
39102 else
39103 return false;
39105 else
39107 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39108 unsigned HOST_WIDE_INT q[8];
39109 unsigned int nonzero_halves[4];
39111 /* Split the two input vectors into 8 quarters. */
39112 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39113 for (i = 1; i < 8; ++i)
39114 q[i] = q[0] << (nelt4 * i);
39115 for (i = 0; i < 4; ++i)
39116 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39118 nonzero_halves[nzcnt] = i;
39119 ++nzcnt;
39122 if (nzcnt == 1)
39124 gcc_assert (d->one_operand_p);
39125 nonzero_halves[1] = nonzero_halves[0];
39126 same_halves = true;
39128 else if (d->one_operand_p)
39130 gcc_assert (nonzero_halves[0] == 0);
39131 gcc_assert (nonzero_halves[1] == 1);
39134 if (nzcnt <= 2)
39136 if (d->perm[0] / nelt2 == nonzero_halves[1])
39138 /* Attempt to increase the likelihood that dfinal
39139 shuffle will be intra-lane. */
39140 char tmph = nonzero_halves[0];
39141 nonzero_halves[0] = nonzero_halves[1];
39142 nonzero_halves[1] = tmph;
39145 /* vperm2f128 or vperm2i128. */
39146 for (i = 0; i < nelt2; ++i)
39148 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39149 remap[i + nonzero_halves[0] * nelt2] = i;
39150 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39151 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39154 if (d->vmode != V8SFmode
39155 && d->vmode != V4DFmode
39156 && d->vmode != V8SImode)
39158 dremap.vmode = V8SImode;
39159 dremap.nelt = 8;
39160 for (i = 0; i < 4; ++i)
39162 dremap.perm[i] = i + nonzero_halves[0] * 4;
39163 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39167 else if (d->one_operand_p)
39168 return false;
39169 else if (TARGET_AVX2
39170 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39172 /* vpunpckl* */
39173 for (i = 0; i < nelt4; ++i)
39175 remap[i] = i * 2;
39176 remap[i + nelt] = i * 2 + 1;
39177 remap[i + nelt2] = i * 2 + nelt2;
39178 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39179 dremap.perm[i * 2] = i;
39180 dremap.perm[i * 2 + 1] = i + nelt;
39181 dremap.perm[i * 2 + nelt2] = i + nelt2;
39182 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39185 else if (TARGET_AVX2
39186 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39188 /* vpunpckh* */
39189 for (i = 0; i < nelt4; ++i)
39191 remap[i + nelt4] = i * 2;
39192 remap[i + nelt + nelt4] = i * 2 + 1;
39193 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39194 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39195 dremap.perm[i * 2] = i + nelt4;
39196 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39197 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39198 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39201 else
39202 return false;
39205 /* Use the remapping array set up above to move the elements from their
39206 swizzled locations into their final destinations. */
39207 dfinal = *d;
39208 for (i = 0; i < nelt; ++i)
39210 unsigned e = remap[d->perm[i]];
39211 gcc_assert (e < nelt);
39212 /* If same_halves is true, both halves of the remapped vector are the
39213 same. Avoid cross-lane accesses if possible. */
39214 if (same_halves && i >= nelt2)
39216 gcc_assert (e < nelt2);
39217 dfinal.perm[i] = e + nelt2;
39219 else
39220 dfinal.perm[i] = e;
39222 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39223 dfinal.op1 = dfinal.op0;
39224 dfinal.one_operand_p = true;
39225 dremap.target = dfinal.op0;
39227 /* Test if the final remap can be done with a single insn. For V4SFmode or
39228 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39229 start_sequence ();
39230 ok = expand_vec_perm_1 (&dfinal);
39231 seq = get_insns ();
39232 end_sequence ();
39234 if (!ok)
39235 return false;
39237 if (d->testing_p)
39238 return true;
39240 if (dremap.vmode != dfinal.vmode)
39242 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39243 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39244 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39247 ok = expand_vec_perm_1 (&dremap);
39248 gcc_assert (ok);
39250 emit_insn (seq);
39251 return true;
39254 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39255 a single vector cross-lane permutation into vpermq followed
39256 by any of the single insn permutations. */
39258 static bool
39259 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39261 struct expand_vec_perm_d dremap, dfinal;
39262 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39263 unsigned contents[2];
39264 bool ok;
39266 if (!(TARGET_AVX2
39267 && (d->vmode == V32QImode || d->vmode == V16HImode)
39268 && d->one_operand_p))
39269 return false;
39271 contents[0] = 0;
39272 contents[1] = 0;
39273 for (i = 0; i < nelt2; ++i)
39275 contents[0] |= 1u << (d->perm[i] / nelt4);
39276 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39279 for (i = 0; i < 2; ++i)
39281 unsigned int cnt = 0;
39282 for (j = 0; j < 4; ++j)
39283 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39284 return false;
39287 if (d->testing_p)
39288 return true;
39290 dremap = *d;
39291 dremap.vmode = V4DImode;
39292 dremap.nelt = 4;
39293 dremap.target = gen_reg_rtx (V4DImode);
39294 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39295 dremap.op1 = dremap.op0;
39296 dremap.one_operand_p = true;
39297 for (i = 0; i < 2; ++i)
39299 unsigned int cnt = 0;
39300 for (j = 0; j < 4; ++j)
39301 if ((contents[i] & (1u << j)) != 0)
39302 dremap.perm[2 * i + cnt++] = j;
39303 for (; cnt < 2; ++cnt)
39304 dremap.perm[2 * i + cnt] = 0;
39307 dfinal = *d;
39308 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39309 dfinal.op1 = dfinal.op0;
39310 dfinal.one_operand_p = true;
39311 for (i = 0, j = 0; i < nelt; ++i)
39313 if (i == nelt2)
39314 j = 2;
39315 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39316 if ((d->perm[i] / nelt4) == dremap.perm[j])
39318 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39319 dfinal.perm[i] |= nelt4;
39320 else
39321 gcc_unreachable ();
39324 ok = expand_vec_perm_1 (&dremap);
39325 gcc_assert (ok);
39327 ok = expand_vec_perm_1 (&dfinal);
39328 gcc_assert (ok);
39330 return true;
39333 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
39334 a vector permutation using two instructions, vperm2f128 resp.
39335 vperm2i128 followed by any single in-lane permutation. */
39337 static bool
39338 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39340 struct expand_vec_perm_d dfirst, dsecond;
39341 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39342 bool ok;
39344 if (!TARGET_AVX
39345 || GET_MODE_SIZE (d->vmode) != 32
39346 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39347 return false;
39349 dsecond = *d;
39350 dsecond.one_operand_p = false;
39351 dsecond.testing_p = true;
39353 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39354 immediate. For perm < 16 the second permutation uses
39355 d->op0 as first operand, for perm >= 16 it uses d->op1
39356 as first operand. The second operand is the result of
39357 vperm2[fi]128. */
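/* For example (hypothetical value), perm == 6 selects op1's low lane (field 2)
for the low half of the vperm2[fi]128 result and op0's high lane (field 1)
for the high half; ((6 << 2) | 6) & 0x33 == 0x12, i.e. 2 | (1 << 4), which is
the imm8 the insn expects. */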
39358 for (perm = 0; perm < 32; perm++)
39360 /* Ignore permutations which do not move anything cross-lane. */
39361 if (perm < 16)
39363 /* The second shuffle for e.g. V4DFmode has
39364 0123 and ABCD operands.
39365 Ignore AB23, as 23 is already in the second lane
39366 of the first operand. */
39367 if ((perm & 0xc) == (1 << 2)) continue;
39368 /* And 01CD, as 01 is in the first lane of the first
39369 operand. */
39370 if ((perm & 3) == 0) continue;
39371 /* And 4567, as then the vperm2[fi]128 doesn't change
39372 anything on the original 4567 second operand. */
39373 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39375 else
39377 /* The second shuffle for e.g. V4DFmode has
39378 4567 and ABCD operands.
39379 Ignore AB67, as 67 is already in the second lane
39380 of the first operand. */
39381 if ((perm & 0xc) == (3 << 2)) continue;
39382 /* And 45CD, as 45 is in the first lane of the first
39383 operand. */
39384 if ((perm & 3) == 2) continue;
39385 /* And 0123, as then the vperm2[fi]128 doesn't change
39386 anything on the original 0123 first operand. */
39387 if ((perm & 0xf) == (1 << 2)) continue;
39390 for (i = 0; i < nelt; i++)
39392 j = d->perm[i] / nelt2;
39393 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39394 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39395 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39396 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39397 else
39398 break;
39401 if (i == nelt)
39403 start_sequence ();
39404 ok = expand_vec_perm_1 (&dsecond);
39405 end_sequence ();
39407 else
39408 ok = false;
39410 if (ok)
39412 if (d->testing_p)
39413 return true;
39415 /* Found a usable second shuffle. dfirst will be
39416 vperm2f128 on d->op0 and d->op1. */
39417 dsecond.testing_p = false;
39418 dfirst = *d;
39419 dfirst.target = gen_reg_rtx (d->vmode);
39420 for (i = 0; i < nelt; i++)
39421 dfirst.perm[i] = (i & (nelt2 - 1))
39422 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39424 ok = expand_vec_perm_1 (&dfirst);
39425 gcc_assert (ok);
39427 /* And dsecond is some single insn shuffle, taking
39428 d->op0 and result of vperm2f128 (if perm < 16) or
39429 d->op1 and result of vperm2f128 (otherwise). */
39430 dsecond.op1 = dfirst.target;
39431 if (perm >= 16)
39432 dsecond.op0 = dfirst.op1;
39434 ok = expand_vec_perm_1 (&dsecond);
39435 gcc_assert (ok);
39437 return true;
39440 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39441 if (d->one_operand_p)
39442 return false;
39445 return false;
39448 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39449 a two vector permutation using 2 intra-lane interleave insns
39450 and cross-lane shuffle for 32-byte vectors. */
39452 static bool
39453 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39455 unsigned i, nelt;
39456 rtx (*gen) (rtx, rtx, rtx);
39458 if (d->one_operand_p)
39459 return false;
39460 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39462 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39464 else
39465 return false;
39467 nelt = d->nelt;
39468 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39469 return false;
39470 for (i = 0; i < nelt; i += 2)
39471 if (d->perm[i] != d->perm[0] + i / 2
39472 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39473 return false;
39475 if (d->testing_p)
39476 return true;
39478 switch (d->vmode)
39480 case V32QImode:
39481 if (d->perm[0])
39482 gen = gen_vec_interleave_highv32qi;
39483 else
39484 gen = gen_vec_interleave_lowv32qi;
39485 break;
39486 case V16HImode:
39487 if (d->perm[0])
39488 gen = gen_vec_interleave_highv16hi;
39489 else
39490 gen = gen_vec_interleave_lowv16hi;
39491 break;
39492 case V8SImode:
39493 if (d->perm[0])
39494 gen = gen_vec_interleave_highv8si;
39495 else
39496 gen = gen_vec_interleave_lowv8si;
39497 break;
39498 case V4DImode:
39499 if (d->perm[0])
39500 gen = gen_vec_interleave_highv4di;
39501 else
39502 gen = gen_vec_interleave_lowv4di;
39503 break;
39504 case V8SFmode:
39505 if (d->perm[0])
39506 gen = gen_vec_interleave_highv8sf;
39507 else
39508 gen = gen_vec_interleave_lowv8sf;
39509 break;
39510 case V4DFmode:
39511 if (d->perm[0])
39512 gen = gen_vec_interleave_highv4df;
39513 else
39514 gen = gen_vec_interleave_lowv4df;
39515 break;
39516 default:
39517 gcc_unreachable ();
39520 emit_insn (gen (d->target, d->op0, d->op1));
39521 return true;
39524 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
39525 a single vector permutation using a single intra-lane vector
39526 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39527 the non-swapped and swapped vectors together. */
39529 static bool
39530 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39532 struct expand_vec_perm_d dfirst, dsecond;
39533 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39534 rtx seq;
39535 bool ok;
39536 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39538 if (!TARGET_AVX
39539 || TARGET_AVX2
39540 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39541 || !d->one_operand_p)
39542 return false;
39544 dfirst = *d;
39545 for (i = 0; i < nelt; i++)
39546 dfirst.perm[i] = 0xff;
39547 for (i = 0, msk = 0; i < nelt; i++)
39549 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39550 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39551 return false;
39552 dfirst.perm[j] = d->perm[i];
39553 if (j != i)
39554 msk |= (1 << i);
39556 for (i = 0; i < nelt; i++)
39557 if (dfirst.perm[i] == 0xff)
39558 dfirst.perm[i] = i;
39560 if (!d->testing_p)
39561 dfirst.target = gen_reg_rtx (dfirst.vmode);
39563 start_sequence ();
39564 ok = expand_vec_perm_1 (&dfirst);
39565 seq = get_insns ();
39566 end_sequence ();
39568 if (!ok)
39569 return false;
39571 if (d->testing_p)
39572 return true;
39574 emit_insn (seq);
39576 dsecond = *d;
39577 dsecond.op0 = dfirst.target;
39578 dsecond.op1 = dfirst.target;
39579 dsecond.one_operand_p = true;
39580 dsecond.target = gen_reg_rtx (dsecond.vmode);
39581 for (i = 0; i < nelt; i++)
39582 dsecond.perm[i] = i ^ nelt2;
39584 ok = expand_vec_perm_1 (&dsecond);
39585 gcc_assert (ok);
39587 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39588 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39589 return true;
39592 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
39593 permutation using two vperm2f128, followed by a vshufpd insn blending
39594 the two vectors together. */
39596 static bool
39597 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39599 struct expand_vec_perm_d dfirst, dsecond, dthird;
39600 bool ok;
39602 if (!TARGET_AVX || (d->vmode != V4DFmode))
39603 return false;
39605 if (d->testing_p)
39606 return true;
39608 dfirst = *d;
39609 dsecond = *d;
39610 dthird = *d;
39612 dfirst.perm[0] = (d->perm[0] & ~1);
39613 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39614 dfirst.perm[2] = (d->perm[2] & ~1);
39615 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39616 dsecond.perm[0] = (d->perm[1] & ~1);
39617 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39618 dsecond.perm[2] = (d->perm[3] & ~1);
39619 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39620 dthird.perm[0] = (d->perm[0] % 2);
39621 dthird.perm[1] = (d->perm[1] % 2) + 4;
39622 dthird.perm[2] = (d->perm[2] % 2) + 2;
39623 dthird.perm[3] = (d->perm[3] % 2) + 6;
39625 dfirst.target = gen_reg_rtx (dfirst.vmode);
39626 dsecond.target = gen_reg_rtx (dsecond.vmode);
39627 dthird.op0 = dfirst.target;
39628 dthird.op1 = dsecond.target;
39629 dthird.one_operand_p = false;
39631 canonicalize_perm (&dfirst);
39632 canonicalize_perm (&dsecond);
39634 ok = expand_vec_perm_1 (&dfirst)
39635 && expand_vec_perm_1 (&dsecond)
39636 && expand_vec_perm_1 (&dthird);
39638 gcc_assert (ok);
39640 return true;
39643 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39644 permutation with two pshufb insns and an ior. We should have already
39645 failed all two instruction sequences. */
39647 static bool
39648 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39650 rtx rperm[2][16], vperm, l, h, op, m128;
39651 unsigned int i, nelt, eltsz;
39653 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39654 return false;
39655 gcc_assert (!d->one_operand_p);
39657 nelt = d->nelt;
39658 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39660 /* Generate two permutation masks. If the required element is within
39661 the given vector it is shuffled into the proper lane. If the required
39662 element is in the other vector, force a zero into the lane by setting
39663 bit 7 in the permutation mask. */
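/* E.g. (illustrative) for a V16QImode extract-even selector { 0 2 4 ... 30 },
the first mask is { 0 2 4 6 8 10 12 14, -128 x 8 } and the second mask is
{ -128 x 8, 0 2 4 6 8 10 12 14 }, so the ior of the two pshufb results
assembles the final vector. */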
39664 m128 = GEN_INT (-128);
39665 for (i = 0; i < nelt; ++i)
39667 unsigned j, e = d->perm[i];
39668 unsigned which = (e >= nelt);
39669 if (e >= nelt)
39670 e -= nelt;
39672 for (j = 0; j < eltsz; ++j)
39674 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39675 rperm[1-which][i*eltsz + j] = m128;
39679 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39680 vperm = force_reg (V16QImode, vperm);
39682 l = gen_reg_rtx (V16QImode);
39683 op = gen_lowpart (V16QImode, d->op0);
39684 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39686 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39687 vperm = force_reg (V16QImode, vperm);
39689 h = gen_reg_rtx (V16QImode);
39690 op = gen_lowpart (V16QImode, d->op1);
39691 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39693 op = gen_lowpart (V16QImode, d->target);
39694 emit_insn (gen_iorv16qi3 (op, l, h));
39696 return true;
39699 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39700 with two vpshufb insns, vpermq and vpor. We should have already failed
39701 all two or three instruction sequences. */
39703 static bool
39704 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39706 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39707 unsigned int i, nelt, eltsz;
39709 if (!TARGET_AVX2
39710 || !d->one_operand_p
39711 || (d->vmode != V32QImode && d->vmode != V16HImode))
39712 return false;
39714 if (d->testing_p)
39715 return true;
39717 nelt = d->nelt;
39718 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39720 /* Generate two permutation masks. If the required element is within
39721 the same lane, it is shuffled in. If the required element is from the
39722 other lane, force a zero by setting bit 7 in the permutation mask.
39723 The other mask has a non-negative element wherever an element is
39724 requested from the other lane; that element is also moved to the
39725 other lane, so that the result of vpshufb can have its two V2TImode
39726 halves swapped. */
39727 m128 = GEN_INT (-128);
39728 for (i = 0; i < nelt; ++i)
39730 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39731 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39733 for (j = 0; j < eltsz; ++j)
39735 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39736 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39740 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39741 vperm = force_reg (V32QImode, vperm);
39743 h = gen_reg_rtx (V32QImode);
39744 op = gen_lowpart (V32QImode, d->op0);
39745 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39747 /* Swap the 128-bit lanes of h into hp. */
39748 hp = gen_reg_rtx (V4DImode);
39749 op = gen_lowpart (V4DImode, h);
39750 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39751 const1_rtx));
39753 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39754 vperm = force_reg (V32QImode, vperm);
39756 l = gen_reg_rtx (V32QImode);
39757 op = gen_lowpart (V32QImode, d->op0);
39758 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39760 op = gen_lowpart (V32QImode, d->target);
39761 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39763 return true;
39766 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39767 and extract-odd permutations of two V32QImode or V16HImode operands
39768 with two vpshufb insns, vpor and vpermq. We should have already
39769 failed all two or three instruction sequences. */
39771 static bool
39772 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39774 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39775 unsigned int i, nelt, eltsz;
39777 if (!TARGET_AVX2
39778 || d->one_operand_p
39779 || (d->vmode != V32QImode && d->vmode != V16HImode))
39780 return false;
39782 for (i = 0; i < d->nelt; ++i)
39783 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39784 return false;
39786 if (d->testing_p)
39787 return true;
39789 nelt = d->nelt;
39790 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39792 /* Generate two permutation masks. In the first permutation mask
39793 the first quarter will contain indexes for the first half
39794 of the op0, the second quarter will contain bit 7 set, third quarter
39795 will contain indexes for the second half of the op0 and the
39796 last quarter bit 7 set. In the second permutation mask
39797 the first quarter will contain bit 7 set, the second quarter
39798 indexes for the first half of the op1, the third quarter bit 7 set
39799 and last quarter indexes for the second half of the op1.
39800 I.e. the first mask e.g. for V32QImode extract even will be:
39801 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39802 (all values masked with 0xf except for -128) and second mask
39803 for extract even will be
39804 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39805 m128 = GEN_INT (-128);
39806 for (i = 0; i < nelt; ++i)
39808 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39809 unsigned which = d->perm[i] >= nelt;
39810 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39812 for (j = 0; j < eltsz; ++j)
39814 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39815 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39819 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39820 vperm = force_reg (V32QImode, vperm);
39822 l = gen_reg_rtx (V32QImode);
39823 op = gen_lowpart (V32QImode, d->op0);
39824 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39826 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39827 vperm = force_reg (V32QImode, vperm);
39829 h = gen_reg_rtx (V32QImode);
39830 op = gen_lowpart (V32QImode, d->op1);
39831 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39833 ior = gen_reg_rtx (V32QImode);
39834 emit_insn (gen_iorv32qi3 (ior, l, h));
39836 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39837 op = gen_lowpart (V4DImode, d->target);
39838 ior = gen_lowpart (V4DImode, ior);
39839 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39840 const1_rtx, GEN_INT (3)));
39842 return true;
39845 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
39846 and extract-odd permutations. */
39848 static bool
39849 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39851 rtx t1, t2, t3;
39853 switch (d->vmode)
39855 case V4DFmode:
39856 t1 = gen_reg_rtx (V4DFmode);
39857 t2 = gen_reg_rtx (V4DFmode);
39859 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39860 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39861 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39863 /* Now an unpck[lh]pd will produce the result required. */
39864 if (odd)
39865 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39866 else
39867 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39868 emit_insn (t3);
39869 break;
39871 case V8SFmode:
39873 int mask = odd ? 0xdd : 0x88;
39875 t1 = gen_reg_rtx (V8SFmode);
39876 t2 = gen_reg_rtx (V8SFmode);
39877 t3 = gen_reg_rtx (V8SFmode);
39879 /* Shuffle within the 128-bit lanes to produce:
39880 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39881 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39882 GEN_INT (mask)));
39884 /* Shuffle the lanes around to produce:
39885 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39886 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39887 GEN_INT (0x3)));
39889 /* Shuffle within the 128-bit lanes to produce:
39890 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39891 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39893 /* Shuffle within the 128-bit lanes to produce:
39894 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39895 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39897 /* Shuffle the lanes around to produce:
39898 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39899 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39900 GEN_INT (0x20)));
39902 break;
39904 case V2DFmode:
39905 case V4SFmode:
39906 case V2DImode:
39907 case V4SImode:
39908 /* These are always directly implementable by expand_vec_perm_1. */
39909 gcc_unreachable ();
39911 case V8HImode:
39912 if (TARGET_SSSE3)
39913 return expand_vec_perm_pshufb2 (d);
39914 else
39916 /* We need 2*log2(N)-1 operations to achieve odd/even
39917 with interleave. */
39918 t1 = gen_reg_rtx (V8HImode);
39919 t2 = gen_reg_rtx (V8HImode);
39920 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39921 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39922 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39923 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39924 if (odd)
39925 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39926 else
39927 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
39928 emit_insn (t3);
39930 break;
39932 case V16QImode:
39933 if (TARGET_SSSE3)
39934 return expand_vec_perm_pshufb2 (d);
39935 else
39937 t1 = gen_reg_rtx (V16QImode);
39938 t2 = gen_reg_rtx (V16QImode);
39939 t3 = gen_reg_rtx (V16QImode);
39940 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
39941 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
39942 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
39943 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
39944 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
39945 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
39946 if (odd)
39947 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
39948 else
39949 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
39950 emit_insn (t3);
39952 break;
39954 case V16HImode:
39955 case V32QImode:
39956 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
39958 case V4DImode:
39959 if (!TARGET_AVX2)
39961 struct expand_vec_perm_d d_copy = *d;
39962 d_copy.vmode = V4DFmode;
39963 d_copy.target = gen_lowpart (V4DFmode, d->target);
39964 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
39965 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
39966 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39969 t1 = gen_reg_rtx (V4DImode);
39970 t2 = gen_reg_rtx (V4DImode);
39972 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39973 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
39974 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
39976 /* Now an vpunpck[lh]qdq will produce the result required. */
39977 if (odd)
39978 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
39979 else
39980 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
39981 emit_insn (t3);
39982 break;
39984 case V8SImode:
39985 if (!TARGET_AVX2)
39987 struct expand_vec_perm_d d_copy = *d;
39988 d_copy.vmode = V8SFmode;
39989 d_copy.target = gen_lowpart (V8SFmode, d->target);
39990 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
39991 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
39992 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39995 t1 = gen_reg_rtx (V8SImode);
39996 t2 = gen_reg_rtx (V8SImode);
39998 /* Shuffle the lanes around into
39999 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40000 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40001 gen_lowpart (V4DImode, d->op0),
40002 gen_lowpart (V4DImode, d->op1),
40003 GEN_INT (0x20)));
40004 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40005 gen_lowpart (V4DImode, d->op0),
40006 gen_lowpart (V4DImode, d->op1),
40007 GEN_INT (0x31)));
40009 /* Swap the 2nd and 3rd position in each lane into
40010 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40011 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40012 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40013 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40014 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40016 /* Now an vpunpck[lh]qdq will produce
40017 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40018 if (odd)
40019 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40020 gen_lowpart (V4DImode, t1),
40021 gen_lowpart (V4DImode, t2));
40022 else
40023 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40024 gen_lowpart (V4DImode, t1),
40025 gen_lowpart (V4DImode, t2));
40026 emit_insn (t3);
40027 break;
40029 default:
40030 gcc_unreachable ();
40033 return true;
40036 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40037 extract-even and extract-odd permutations. */
40039 static bool
40040 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40042 unsigned i, odd, nelt = d->nelt;
40044 odd = d->perm[0];
40045 if (odd != 0 && odd != 1)
40046 return false;
40048 for (i = 1; i < nelt; ++i)
40049 if (d->perm[i] != 2 * i + odd)
40050 return false;
40052 return expand_vec_perm_even_odd_1 (d, odd);
40055 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
40056 permutations. We assume that expand_vec_perm_1 has already failed. */
40058 static bool
40059 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40061 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40062 enum machine_mode vmode = d->vmode;
40063 unsigned char perm2[4];
40064 rtx op0 = d->op0;
40065 bool ok;
40067 switch (vmode)
40069 case V4DFmode:
40070 case V8SFmode:
40071 /* These are special-cased in sse.md so that we can optionally
40072 use the vbroadcast instruction. They expand to two insns
40073 if the input happens to be in a register. */
40074 gcc_unreachable ();
40076 case V2DFmode:
40077 case V2DImode:
40078 case V4SFmode:
40079 case V4SImode:
40080 /* These are always implementable using standard shuffle patterns. */
40081 gcc_unreachable ();
40083 case V8HImode:
40084 case V16QImode:
40085 /* These can be implemented via interleave. We save one insn by
40086 stopping once we have promoted to V4SImode and then using pshufd. */
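/* Illustrative example: broadcasting element 5 of a V8HImode vector uses one
vec_interleave_highv8hi (giving { 4 4 5 5 6 6 7 7 }), after which the loop
stops at V4SImode and the pshufd replicates SImode element 1, which holds
the two copies of the original element 5. */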
40089 rtx dest;
40090 rtx (*gen) (rtx, rtx, rtx)
40091 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40092 : gen_vec_interleave_lowv8hi;
40094 if (elt >= nelt2)
40096 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40097 : gen_vec_interleave_highv8hi;
40098 elt -= nelt2;
40100 nelt2 /= 2;
40102 dest = gen_reg_rtx (vmode);
40103 emit_insn (gen (dest, op0, op0));
40104 vmode = get_mode_wider_vector (vmode);
40105 op0 = gen_lowpart (vmode, dest);
40107 while (vmode != V4SImode);
40109 memset (perm2, elt, 4);
40110 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40111 d->testing_p);
40112 gcc_assert (ok);
40113 return true;
40115 case V32QImode:
40116 case V16HImode:
40117 case V8SImode:
40118 case V4DImode:
40119 /* For AVX2 broadcasts of the first element vpbroadcast* or
40120 vpermq should be used by expand_vec_perm_1. */
40121 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40122 return false;
40124 default:
40125 gcc_unreachable ();
40129 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40130 broadcast permutations. */
40132 static bool
40133 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40135 unsigned i, elt, nelt = d->nelt;
40137 if (!d->one_operand_p)
40138 return false;
40140 elt = d->perm[0];
40141 for (i = 1; i < nelt; ++i)
40142 if (d->perm[i] != elt)
40143 return false;
40145 return expand_vec_perm_broadcast_1 (d);
40148 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40149 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40150 all the shorter instruction sequences. */
40152 static bool
40153 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40155 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40156 unsigned int i, nelt, eltsz;
40157 bool used[4];
40159 if (!TARGET_AVX2
40160 || d->one_operand_p
40161 || (d->vmode != V32QImode && d->vmode != V16HImode))
40162 return false;
40164 if (d->testing_p)
40165 return true;
40167 nelt = d->nelt;
40168 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40170 /* Generate 4 permutation masks. If the required element is within
40171 the same lane, it is shuffled in. If the required element is from the
40172 other lane, force a zero by setting bit 7 in the permutation mask.
40173 The corresponding cross-lane mask has a non-negative element wherever
40174 an element is requested from the other lane; that element is also
40175 moved to the other lane, so that the result of vpshufb can have its
40176 two V2TImode halves swapped. */
40177 m128 = GEN_INT (-128);
40178 for (i = 0; i < 32; ++i)
40180 rperm[0][i] = m128;
40181 rperm[1][i] = m128;
40182 rperm[2][i] = m128;
40183 rperm[3][i] = m128;
40185 used[0] = false;
40186 used[1] = false;
40187 used[2] = false;
40188 used[3] = false;
40189 for (i = 0; i < nelt; ++i)
40191 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40192 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40193 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40195 for (j = 0; j < eltsz; ++j)
40196 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40197 used[which] = true;
40200 for (i = 0; i < 2; ++i)
40202 if (!used[2 * i + 1])
40204 h[i] = NULL_RTX;
40205 continue;
40207 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40208 gen_rtvec_v (32, rperm[2 * i + 1]));
40209 vperm = force_reg (V32QImode, vperm);
40210 h[i] = gen_reg_rtx (V32QImode);
40211 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40212 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40215 /* Swap the 128-bit lanes of h[X]. */
40216 for (i = 0; i < 2; ++i)
40218 if (h[i] == NULL_RTX)
40219 continue;
40220 op = gen_reg_rtx (V4DImode);
40221 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40222 const2_rtx, GEN_INT (3), const0_rtx,
40223 const1_rtx));
40224 h[i] = gen_lowpart (V32QImode, op);
40227 for (i = 0; i < 2; ++i)
40229 if (!used[2 * i])
40231 l[i] = NULL_RTX;
40232 continue;
40234 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40235 vperm = force_reg (V32QImode, vperm);
40236 l[i] = gen_reg_rtx (V32QImode);
40237 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40238 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40241 for (i = 0; i < 2; ++i)
40243 if (h[i] && l[i])
40245 op = gen_reg_rtx (V32QImode);
40246 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40247 l[i] = op;
40249 else if (h[i])
40250 l[i] = h[i];
40253 gcc_assert (l[0] && l[1]);
40254 op = gen_lowpart (V32QImode, d->target);
40255 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40256 return true;
40259 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40260 With all of the interface bits taken care of, perform the expansion
40261 in D and return true on success. */
40263 static bool
40264 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40266 /* Try a single instruction expansion. */
40267 if (expand_vec_perm_1 (d))
40268 return true;
40270 /* Try sequences of two instructions. */
40272 if (expand_vec_perm_pshuflw_pshufhw (d))
40273 return true;
40275 if (expand_vec_perm_palignr (d))
40276 return true;
40278 if (expand_vec_perm_interleave2 (d))
40279 return true;
40281 if (expand_vec_perm_broadcast (d))
40282 return true;
40284 if (expand_vec_perm_vpermq_perm_1 (d))
40285 return true;
40287 if (expand_vec_perm_vperm2f128 (d))
40288 return true;
40290 /* Try sequences of three instructions. */
40292 if (expand_vec_perm_2vperm2f128_vshuf (d))
40293 return true;
40295 if (expand_vec_perm_pshufb2 (d))
40296 return true;
40298 if (expand_vec_perm_interleave3 (d))
40299 return true;
40301 if (expand_vec_perm_vperm2f128_vblend (d))
40302 return true;
40304 /* Try sequences of four instructions. */
40306 if (expand_vec_perm_vpshufb2_vpermq (d))
40307 return true;
40309 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40310 return true;
40312 /* ??? Look for narrow permutations whose element orderings would
40313 allow the promotion to a wider mode. */
40315 /* ??? Look for sequences of interleave or a wider permute that place
40316 the data into the correct lanes for a half-vector shuffle like
40317 pshuf[lh]w or vpermilps. */
40319 /* ??? Look for sequences of interleave that produce the desired results.
40320 The combinatorics of punpck[lh] get pretty ugly... */
40322 if (expand_vec_perm_even_odd (d))
40323 return true;
40325 /* Even longer sequences. */
40326 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40327 return true;
40329 return false;
40332 /* If a permutation only uses one operand, make it clear. Returns true
40333 if the permutation references both operands. */
40335 static bool
40336 canonicalize_perm (struct expand_vec_perm_d *d)
40338 int i, which, nelt = d->nelt;
40340 for (i = which = 0; i < nelt; ++i)
40341 which |= (d->perm[i] < nelt ? 1 : 2);
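/* Example (illustrative): with nelt == 4 and perm { 4 5 6 7 }, only the
second operand is referenced, so which == 2; the indices are then folded
to { 0 1 2 3 } below and op1 is used as the single input vector. */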
40343 d->one_operand_p = true;
40344 switch (which)
40346 default:
40347 gcc_unreachable();
40349 case 3:
40350 if (!rtx_equal_p (d->op0, d->op1))
40352 d->one_operand_p = false;
40353 break;
40355 /* The elements of PERM do not suggest that only the first operand
40356 is used, but both operands are identical. Allow easier matching
40357 of the permutation by folding the permutation into the single
40358 input vector. */
40359 /* FALLTHRU */
40361 case 2:
40362 for (i = 0; i < nelt; ++i)
40363 d->perm[i] &= nelt - 1;
40364 d->op0 = d->op1;
40365 break;
40367 case 1:
40368 d->op1 = d->op0;
40369 break;
40372 return (which == 3);
40375 bool
40376 ix86_expand_vec_perm_const (rtx operands[4])
40378 struct expand_vec_perm_d d;
40379 unsigned char perm[MAX_VECT_LEN];
40380 int i, nelt;
40381 bool two_args;
40382 rtx sel;
40384 d.target = operands[0];
40385 d.op0 = operands[1];
40386 d.op1 = operands[2];
40387 sel = operands[3];
40389 d.vmode = GET_MODE (d.target);
40390 gcc_assert (VECTOR_MODE_P (d.vmode));
40391 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40392 d.testing_p = false;
40394 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40395 gcc_assert (XVECLEN (sel, 0) == nelt);
40396 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40398 for (i = 0; i < nelt; ++i)
40400 rtx e = XVECEXP (sel, 0, i);
40401 int ei = INTVAL (e) & (2 * nelt - 1);
40402 d.perm[i] = ei;
40403 perm[i] = ei;
40406 two_args = canonicalize_perm (&d);
40408 if (ix86_expand_vec_perm_const_1 (&d))
40409 return true;
40411 /* If the selector says both arguments are needed, but the operands are the
40412 same, the above tried to expand with one_operand_p and flattened selector.
40413 If that didn't work, retry without one_operand_p; we succeeded with that
40414 during testing. */
40415 if (two_args && d.one_operand_p)
40417 d.one_operand_p = false;
40418 memcpy (d.perm, perm, sizeof (perm));
40419 return ix86_expand_vec_perm_const_1 (&d);
40422 return false;
40425 /* Implement targetm.vectorize.vec_perm_const_ok. */
40427 static bool
40428 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40429 const unsigned char *sel)
40431 struct expand_vec_perm_d d;
40432 unsigned int i, nelt, which;
40433 bool ret;
40435 d.vmode = vmode;
40436 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40437 d.testing_p = true;
40439 /* Given sufficient ISA support we can just return true here
40440 for selected vector modes. */
40441 if (GET_MODE_SIZE (d.vmode) == 16)
40443 /* All implementable with a single vpperm insn. */
40444 if (TARGET_XOP)
40445 return true;
40446 /* All implementable with 2 pshufb + 1 ior. */
40447 if (TARGET_SSSE3)
40448 return true;
40449 /* All implementable with shufpd or unpck[lh]pd. */
40450 if (d.nelt == 2)
40451 return true;
40454 /* Extract the values from the vector CST into the permutation
40455 array in D. */
40456 memcpy (d.perm, sel, nelt);
40457 for (i = which = 0; i < nelt; ++i)
40459 unsigned char e = d.perm[i];
40460 gcc_assert (e < 2 * nelt);
40461 which |= (e < nelt ? 1 : 2);
40464 /* If all elements are from the second vector, fold them to the first. */
40465 if (which == 2)
40466 for (i = 0; i < nelt; ++i)
40467 d.perm[i] -= nelt;
40469 /* Check whether the mask can be applied to the vector type. */
40470 d.one_operand_p = (which != 3);
40472 /* Implementable with shufps or pshufd. */
40473 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40474 return true;
40476 /* Otherwise we have to go through the motions and see if we can
40477 figure out how to generate the requested permutation. */
40478 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40479 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40480 if (!d.one_operand_p)
40481 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40483 start_sequence ();
40484 ret = ix86_expand_vec_perm_const_1 (&d);
40485 end_sequence ();
40487 return ret;
40490 void
40491 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40493 struct expand_vec_perm_d d;
40494 unsigned i, nelt;
40496 d.target = targ;
40497 d.op0 = op0;
40498 d.op1 = op1;
40499 d.vmode = GET_MODE (targ);
40500 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40501 d.one_operand_p = false;
40502 d.testing_p = false;
40504 for (i = 0; i < nelt; ++i)
40505 d.perm[i] = i * 2 + odd;
40507 /* We'll either be able to implement the permutation directly... */
40508 if (expand_vec_perm_1 (&d))
40509 return;
40511 /* ... or we use the special-case patterns. */
40512 expand_vec_perm_even_odd_1 (&d, odd);
40515 static void
40516 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40518 struct expand_vec_perm_d d;
40519 unsigned i, nelt, base;
40520 bool ok;
40522 d.target = targ;
40523 d.op0 = op0;
40524 d.op1 = op1;
40525 d.vmode = GET_MODE (targ);
40526 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40527 d.one_operand_p = false;
40528 d.testing_p = false;
40530 base = high_p ? nelt / 2 : 0;
40531 for (i = 0; i < nelt / 2; ++i)
40533 d.perm[i * 2] = i + base;
40534 d.perm[i * 2 + 1] = i + base + nelt;
40537 /* Note that for AVX this isn't one instruction. */
40538 ok = ix86_expand_vec_perm_const_1 (&d);
40539 gcc_assert (ok);
40543 /* Expand a vector operation CODE for a V*QImode in terms of the
40544 same operation on V*HImode. */
40546 void
40547 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40549 enum machine_mode qimode = GET_MODE (dest);
40550 enum machine_mode himode;
40551 rtx (*gen_il) (rtx, rtx, rtx);
40552 rtx (*gen_ih) (rtx, rtx, rtx);
40553 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40554 struct expand_vec_perm_d d;
40555 bool ok, full_interleave;
40556 bool uns_p = false;
40557 int i;
40559 switch (qimode)
40561 case V16QImode:
40562 himode = V8HImode;
40563 gen_il = gen_vec_interleave_lowv16qi;
40564 gen_ih = gen_vec_interleave_highv16qi;
40565 break;
40566 case V32QImode:
40567 himode = V16HImode;
40568 gen_il = gen_avx2_interleave_lowv32qi;
40569 gen_ih = gen_avx2_interleave_highv32qi;
40570 break;
40571 default:
40572 gcc_unreachable ();
40575 op2_l = op2_h = op2;
40576 switch (code)
40578 case MULT:
40579 /* Unpack data such that we've got a source byte in each low byte of
40580 each word. We don't care what goes into the high byte of each word.
40581 Rather than trying to get zero in there, it is most convenient to
40582 let it be a copy of the low byte. */
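/* A short arithmetic sketch of why the copy is harmless: each HImode lane
then holds b * 0x0101, so the product of two such lanes is
a * b * 0x0101 * 0x0101 modulo 2^16, whose low byte is still (a * b) & 0xff;
only the even (low) bytes of the widened result are kept afterwards. */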
40583 op2_l = gen_reg_rtx (qimode);
40584 op2_h = gen_reg_rtx (qimode);
40585 emit_insn (gen_il (op2_l, op2, op2));
40586 emit_insn (gen_ih (op2_h, op2, op2));
40587 /* FALLTHRU */
40589 op1_l = gen_reg_rtx (qimode);
40590 op1_h = gen_reg_rtx (qimode);
40591 emit_insn (gen_il (op1_l, op1, op1));
40592 emit_insn (gen_ih (op1_h, op1, op1));
40593 full_interleave = qimode == V16QImode;
40594 break;
40596 case ASHIFT:
40597 case LSHIFTRT:
40598 uns_p = true;
40599 /* FALLTHRU */
40600 case ASHIFTRT:
40601 op1_l = gen_reg_rtx (himode);
40602 op1_h = gen_reg_rtx (himode);
40603 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40604 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40605 full_interleave = true;
40606 break;
40607 default:
40608 gcc_unreachable ();
40611 /* Perform the operation. */
40612 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40613 1, OPTAB_DIRECT);
40614 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40615 1, OPTAB_DIRECT);
40616 gcc_assert (res_l && res_h);
40618 /* Merge the data back into the right place. */
40619 d.target = dest;
40620 d.op0 = gen_lowpart (qimode, res_l);
40621 d.op1 = gen_lowpart (qimode, res_h);
40622 d.vmode = qimode;
40623 d.nelt = GET_MODE_NUNITS (qimode);
40624 d.one_operand_p = false;
40625 d.testing_p = false;
40627 if (full_interleave)
40629 /* For SSE2, we used a full interleave, so the desired
40630 results are in the even elements. */
40631 for (i = 0; i < 32; ++i)
40632 d.perm[i] = i * 2;
40634 else
40636 /* For AVX, the interleave used above was not cross-lane. So we extract
40637 the even elements, but with the second and third quarters swapped.
40638 Happily, that is even one insn shorter than plain even extraction. */
40639 for (i = 0; i < 32; ++i)
40640 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
40643 ok = ix86_expand_vec_perm_const_1 (&d);
40644 gcc_assert (ok);
40646 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40647 gen_rtx_fmt_ee (code, qimode, op1, op2));
40650 void
40651 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40652 bool uns_p, bool odd_p)
40654 enum machine_mode mode = GET_MODE (op1);
40655 enum machine_mode wmode = GET_MODE (dest);
40656 rtx x;
40658 /* We only play even/odd games with vectors of SImode. */
40659 gcc_assert (mode == V4SImode || mode == V8SImode);
40661 /* If we're looking for the odd results, shift those members down to
40662 the even slots. For some cpus this is faster than a PSHUFD. */
40663 if (odd_p)
40665 if (TARGET_XOP && mode == V4SImode)
40667 x = force_reg (wmode, CONST0_RTX (wmode));
40668 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40669 return;
40672 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40673 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40674 x, NULL, 1, OPTAB_DIRECT);
40675 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40676 x, NULL, 1, OPTAB_DIRECT);
40677 op1 = gen_lowpart (mode, op1);
40678 op2 = gen_lowpart (mode, op2);
40681 if (mode == V8SImode)
40683 if (uns_p)
40684 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40685 else
40686 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40688 else if (uns_p)
40689 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40690 else if (TARGET_SSE4_1)
40691 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40692 else
40694 rtx s1, s2, t0, t1, t2;
40696 /* The easiest way to implement this without PMULDQ is to go through
40697 the motions as if we are performing a full 64-bit multiply, except
40698 that we need to do less shuffling of the elements. */
40700 /* Compute the sign-extension, aka highparts, of the two operands. */
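      /* Viewed as 64-bit sign-extended values, HI(X) is -1 where X is
	 negative and 0 otherwise, which is exactly the all-ones/all-zeros
	 mask the GT comparison against zero below produces.  */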
40701 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40702 op1, pc_rtx, pc_rtx);
40703 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40704 op2, pc_rtx, pc_rtx);
40706 /* Multiply LO(A) * HI(B), and vice-versa. */
40707 t1 = gen_reg_rtx (wmode);
40708 t2 = gen_reg_rtx (wmode);
40709 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40710 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40712 /* Multiply LO(A) * LO(B). */
40713 t0 = gen_reg_rtx (wmode);
40714 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40716 /* Combine and shift the highparts into place. */
40717 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40718 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40719 1, OPTAB_DIRECT);
40721 /* Combine high and low parts. */
40722 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40723 return;
40725 emit_insn (x);
40728 void
40729 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40730 bool uns_p, bool high_p)
40732 enum machine_mode wmode = GET_MODE (dest);
40733 enum machine_mode mode = GET_MODE (op1);
40734 rtx t1, t2, t3, t4, mask;
40736 switch (mode)
40738 case V4SImode:
40739 t1 = gen_reg_rtx (mode);
40740 t2 = gen_reg_rtx (mode);
40741 if (TARGET_XOP && !uns_p)
40743 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40744 shuffle the elements once so that all elements are in the right
40745 place for immediate use: { A C B D }. */
40746 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40747 const1_rtx, GEN_INT (3)));
40748 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40749 const1_rtx, GEN_INT (3)));
40751 else
40753 /* Put the elements into place for the multiply. */
40754 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40755 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40756 high_p = false;
40758 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40759 break;
40761 case V8SImode:
40762 /* Shuffle the elements between the lanes. After this we
40763 have { A B E F | C D G H } for each operand. */
40764 t1 = gen_reg_rtx (V4DImode);
40765 t2 = gen_reg_rtx (V4DImode);
40766 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40767 const0_rtx, const2_rtx,
40768 const1_rtx, GEN_INT (3)));
40769 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40770 const0_rtx, const2_rtx,
40771 const1_rtx, GEN_INT (3)));
40773 /* Shuffle the elements within the lanes. After this we
40774 have { A A B B | C C D D } or { E E F F | G G H H }. */
40775 t3 = gen_reg_rtx (V8SImode);
40776 t4 = gen_reg_rtx (V8SImode);
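      /* Each 2-bit field of the pshufd immediate below selects one source
	 element per result slot: for high_p the selectors are { 2, 2, 3, 3 },
	 otherwise { 0, 0, 1, 1 }, duplicating each element of the pair.  */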
40777 mask = GEN_INT (high_p
40778 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40779 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
40780 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40781 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40783 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40784 break;
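      /* For HImode vectors both halves of each product are available
	 directly (mul for the low half, mul_highpart for the high half);
	 interleaving the two yields the widened products for the requested
	 half of the elements.  */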
40786 case V8HImode:
40787 case V16HImode:
40788 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40789 uns_p, OPTAB_DIRECT);
40790 t2 = expand_binop (mode,
40791 uns_p ? umul_highpart_optab : smul_highpart_optab,
40792 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40793 gcc_assert (t1 && t2);
40795 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40796 break;
40798 case V16QImode:
40799 case V32QImode:
40800 t1 = gen_reg_rtx (wmode);
40801 t2 = gen_reg_rtx (wmode);
40802 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40803 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40805 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40806 break;
40808 default:
40809 gcc_unreachable ();
40813 void
40814 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40816 rtx res_1, res_2;
40818 res_1 = gen_reg_rtx (V4SImode);
40819 res_2 = gen_reg_rtx (V4SImode);
40820 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40821 op1, op2, true, false);
40822 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40823 op1, op2, true, true);
40825 /* Move the results in element 2 down to element 1; we don't care
40826 what goes in elements 2 and 3. Then we can merge the parts
40827 back together with an interleave.
40829 Note that two other sequences were tried:
40830 (1) Use interleaves at the start instead of psrldq, which allows
40831 us to use a single shufps to merge things back at the end.
40832 (2) Use shufps here to combine the two vectors, then pshufd to
40833 put the elements in the correct order.
40834 In both cases the cost of the reformatting stall was too high
40835 and the overall sequence slower. */
40837 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40838 const0_rtx, const0_rtx));
40839 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40840 const0_rtx, const0_rtx));
40841 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40843 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
40846 void
40847 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40849 enum machine_mode mode = GET_MODE (op0);
40850 rtx t1, t2, t3, t4, t5, t6;
40852 if (TARGET_XOP && mode == V2DImode)
40854 /* op1: A,B,C,D, op2: E,F,G,H */
40855 op1 = gen_lowpart (V4SImode, op1);
40856 op2 = gen_lowpart (V4SImode, op2);
40858 t1 = gen_reg_rtx (V4SImode);
40859 t2 = gen_reg_rtx (V4SImode);
40860 t3 = gen_reg_rtx (V2DImode);
40861 t4 = gen_reg_rtx (V2DImode);
40863 /* t1: B,A,D,C */
40864 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40865 GEN_INT (1),
40866 GEN_INT (0),
40867 GEN_INT (3),
40868 GEN_INT (2)));
40870 /* t2: (B*E),(A*F),(D*G),(C*H) */
40871 emit_insn (gen_mulv4si3 (t2, t1, op2));
40873 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40874 emit_insn (gen_xop_phadddq (t3, t2));
40876 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40877 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40879 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40880 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40882 else
40884 enum machine_mode nmode;
40885 rtx (*umul) (rtx, rtx, rtx);
40887 if (mode == V2DImode)
40889 umul = gen_vec_widen_umult_even_v4si;
40890 nmode = V4SImode;
40892 else if (mode == V4DImode)
40894 umul = gen_vec_widen_umult_even_v8si;
40895 nmode = V8SImode;
40897 else
40898 gcc_unreachable ();
40901 /* Multiply low parts. */
40902 t1 = gen_reg_rtx (mode);
40903 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40905 /* Shift input vectors right 32 bits so we can multiply high parts. */
40906 t6 = GEN_INT (32);
40907 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40908 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40910 /* Multiply high parts by low parts. */
40911 t4 = gen_reg_rtx (mode);
40912 t5 = gen_reg_rtx (mode);
40913 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40914 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40916 /* Combine and shift the highparts back. */
40917 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40918 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40920 /* Combine high and low parts. */
40921 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
40924 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40925 gen_rtx_MULT (mode, op1, op2));
40928 /* Expand an insert into a vector register through pinsr insn.
40929 Return true if successful. */
40931 bool
40932 ix86_expand_pinsr (rtx *operands)
40934 rtx dst = operands[0];
40935 rtx src = operands[3];
40937 unsigned int size = INTVAL (operands[1]);
40938 unsigned int pos = INTVAL (operands[2]);
40940 if (GET_CODE (dst) == SUBREG)
40942 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
40943 dst = SUBREG_REG (dst);
40946 if (GET_CODE (src) == SUBREG)
40947 src = SUBREG_REG (src);
40949 switch (GET_MODE (dst))
40951 case V16QImode:
40952 case V8HImode:
40953 case V4SImode:
40954 case V2DImode:
40956 enum machine_mode srcmode, dstmode;
40957 rtx (*pinsr)(rtx, rtx, rtx, rtx);
40959 srcmode = mode_for_size (size, MODE_INT, 0);
40961 switch (srcmode)
40963 case QImode:
40964 if (!TARGET_SSE4_1)
40965 return false;
40966 dstmode = V16QImode;
40967 pinsr = gen_sse4_1_pinsrb;
40968 break;
40970 case HImode:
40971 if (!TARGET_SSE2)
40972 return false;
40973 dstmode = V8HImode;
40974 pinsr = gen_sse2_pinsrw;
40975 break;
40977 case SImode:
40978 if (!TARGET_SSE4_1)
40979 return false;
40980 dstmode = V4SImode;
40981 pinsr = gen_sse4_1_pinsrd;
40982 break;
40984 case DImode:
40985 gcc_assert (TARGET_64BIT);
40986 if (!TARGET_SSE4_1)
40987 return false;
40988 dstmode = V2DImode;
40989 pinsr = gen_sse4_1_pinsrq;
40990 break;
40992 default:
40993 return false;
40996 dst = gen_lowpart (dstmode, dst);
40997 src = gen_lowpart (srcmode, src);
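      /* Convert the bit offset into an element index; the insert position
	 is then passed to the pinsr pattern as a one-bit selector mask
	 (1 << element index).  */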
40999 pos /= size;
41001 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41002 return true;
41005 default:
41006 return false;
41010 /* This function returns the calling-ABI-specific va_list type node,
41011 i.e. the va_list type appropriate for FNDECL. */
41013 static tree
41014 ix86_fn_abi_va_list (tree fndecl)
41016 if (!TARGET_64BIT)
41017 return va_list_type_node;
41018 gcc_assert (fndecl != NULL_TREE);
41020 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41021 return ms_va_list_type_node;
41022 else
41023 return sysv_va_list_type_node;
41026 /* Returns the canonical va_list type specified by TYPE. If there
41027 is no valid TYPE provided, it returns NULL_TREE. */
41029 static tree
41030 ix86_canonical_va_list_type (tree type)
41032 tree wtype, htype;
41034 /* Resolve references and pointers to va_list type. */
41035 if (TREE_CODE (type) == MEM_REF)
41036 type = TREE_TYPE (type);
41037 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41038 type = TREE_TYPE (type);
41039 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41040 type = TREE_TYPE (type);
41042 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41044 wtype = va_list_type_node;
41045 gcc_assert (wtype != NULL_TREE);
41046 htype = type;
41047 if (TREE_CODE (wtype) == ARRAY_TYPE)
41049 /* If va_list is an array type, the argument may have decayed
41050 to a pointer type, e.g. by being passed to another function.
41051 In that case, unwrap both types so that we can compare the
41052 underlying records. */
41053 if (TREE_CODE (htype) == ARRAY_TYPE
41054 || POINTER_TYPE_P (htype))
41056 wtype = TREE_TYPE (wtype);
41057 htype = TREE_TYPE (htype);
41060 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41061 return va_list_type_node;
41062 wtype = sysv_va_list_type_node;
41063 gcc_assert (wtype != NULL_TREE);
41064 htype = type;
41065 if (TREE_CODE (wtype) == ARRAY_TYPE)
41067 /* If va_list is an array type, the argument may have decayed
41068 to a pointer type, e.g. by being passed to another function.
41069 In that case, unwrap both types so that we can compare the
41070 underlying records. */
41071 if (TREE_CODE (htype) == ARRAY_TYPE
41072 || POINTER_TYPE_P (htype))
41074 wtype = TREE_TYPE (wtype);
41075 htype = TREE_TYPE (htype);
41078 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41079 return sysv_va_list_type_node;
41080 wtype = ms_va_list_type_node;
41081 gcc_assert (wtype != NULL_TREE);
41082 htype = type;
41083 if (TREE_CODE (wtype) == ARRAY_TYPE)
41085 /* If va_list is an array type, the argument may have decayed
41086 to a pointer type, e.g. by being passed to another function.
41087 In that case, unwrap both types so that we can compare the
41088 underlying records. */
41089 if (TREE_CODE (htype) == ARRAY_TYPE
41090 || POINTER_TYPE_P (htype))
41092 wtype = TREE_TYPE (wtype);
41093 htype = TREE_TYPE (htype);
41096 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41097 return ms_va_list_type_node;
41098 return NULL_TREE;
41100 return std_canonical_va_list_type (type);
41103 /* Iterate through the target-specific builtin types for va_list.
41104 IDX denotes the iterator, *PTREE is set to the result type of
41105 the va_list builtin, and *PNAME to its internal name.
41106 Returns zero if there is no element for this index, otherwise
41107 IDX should be increased upon the next call.
41108 Note: do not iterate over a base builtin's name such as __builtin_va_list.
41109 Used from c_common_nodes_and_builtins. */
41111 static int
41112 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41114 if (TARGET_64BIT)
41116 switch (idx)
41118 default:
41119 break;
41121 case 0:
41122 *ptree = ms_va_list_type_node;
41123 *pname = "__builtin_ms_va_list";
41124 return 1;
41126 case 1:
41127 *ptree = sysv_va_list_type_node;
41128 *pname = "__builtin_sysv_va_list";
41129 return 1;
41133 return 0;
41136 #undef TARGET_SCHED_DISPATCH
41137 #define TARGET_SCHED_DISPATCH has_dispatch
41138 #undef TARGET_SCHED_DISPATCH_DO
41139 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41140 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41141 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41142 #undef TARGET_SCHED_REORDER
41143 #define TARGET_SCHED_REORDER ix86_sched_reorder
41144 #undef TARGET_SCHED_ADJUST_PRIORITY
41145 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41146 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41147 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41149 /* The size of the dispatch window is the total number of bytes of
41150 object code allowed in a window. */
41151 #define DISPATCH_WINDOW_SIZE 16
41153 /* Number of dispatch windows considered for scheduling. */
41154 #define MAX_DISPATCH_WINDOWS 3
41156 /* Maximum number of instructions in a window. */
41157 #define MAX_INSN 4
41159 /* Maximum number of immediate operands in a window. */
41160 #define MAX_IMM 4
41162 /* Maximum number of immediate bits allowed in a window. */
41163 #define MAX_IMM_SIZE 128
41165 /* Maximum number of 32 bit immediates allowed in a window. */
41166 #define MAX_IMM_32 4
41168 /* Maximum number of 64 bit immediates allowed in a window. */
41169 #define MAX_IMM_64 2
41171 /* Maximum total of loads or prefetches allowed in a window. */
41172 #define MAX_LOAD 2
41174 /* Maximum total of stores allowed in a window. */
41175 #define MAX_STORE 1
41177 #undef BIG
41178 #define BIG 100
41181 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41182 enum dispatch_group {
41183 disp_no_group = 0,
41184 disp_load,
41185 disp_store,
41186 disp_load_store,
41187 disp_prefetch,
41188 disp_imm,
41189 disp_imm_32,
41190 disp_imm_64,
41191 disp_branch,
41192 disp_cmp,
41193 disp_jcc,
41194 disp_last
41197 /* Number of allowable groups in a dispatch window. It is an array
41198 indexed by dispatch_group enum. 100 is used as a big number,
41199 because the number of these kinds of operations does not have any
41200 effect on the dispatch window, but we need them for other reasons in
41201 the table. */
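/* In enum order: no_group, load, store, load_store, prefetch, imm,
   imm_32, imm_64, branch, cmp, jcc.  */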
41202 static unsigned int num_allowable_groups[disp_last] = {
41203 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41206 char group_name[disp_last + 1][16] = {
41207 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41208 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41209 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41212 /* Instruction path. */
41213 enum insn_path {
41214 no_path = 0,
41215 path_single, /* Single micro op. */
41216 path_double, /* Double micro op. */
41217 path_multi, /* Instructions with more than 2 micro ops. */
41218 last_path
41221 /* sched_insn_info describes one instruction scheduled into a dispatch
41222 window: the insn itself, its dispatch group, its decode path, and its
41223 byte and immediate sizes.
41225 Windows are allocated for each basic block and are linked
41226 together. */
41227 typedef struct sched_insn_info_s {
41228 rtx insn;
41229 enum dispatch_group group;
41230 enum insn_path path;
41231 int byte_len;
41232 int imm_bytes;
41233 } sched_insn_info;
41235 /* Linked list of dispatch windows. This is a two way list of
41236 dispatch windows of a basic block. It contains information about
41237 the number of uops in the window and the total number of
41238 instructions and of bytes in the object code for this dispatch
41239 window. */
41240 typedef struct dispatch_windows_s {
41241 int num_insn; /* Number of insn in the window. */
41242 int num_uops; /* Number of uops in the window. */
41243 int window_size; /* Number of bytes in the window. */
41244 int window_num; /* Window number, either 0 or 1. */
41245 int num_imm; /* Number of immediates in the window. */
41246 int num_imm_32; /* Number of 32 bit immediates in the window. */
41247 int num_imm_64; /* Number of 64 bit immediates in the window. */
41248 int imm_size; /* Total size in bytes of immediates in the window. */
41249 int num_loads; /* Total memory loads in the window. */
41250 int num_stores; /* Total memory stores in the window. */
41251 int violation; /* Violation exists in window. */
41252 sched_insn_info *window; /* Pointer to the window. */
41253 struct dispatch_windows_s *next;
41254 struct dispatch_windows_s *prev;
41255 } dispatch_windows;
41257 /* Immediate values used in an insn. */
41258 typedef struct imm_info_s
41260 int imm;
41261 int imm32;
41262 int imm64;
41263 } imm_info;
41265 static dispatch_windows *dispatch_window_list;
41266 static dispatch_windows *dispatch_window_list1;
41268 /* Get dispatch group of insn. */
41270 static enum dispatch_group
41271 get_mem_group (rtx insn)
41273 enum attr_memory memory;
41275 if (INSN_CODE (insn) < 0)
41276 return disp_no_group;
41277 memory = get_attr_memory (insn);
41278 if (memory == MEMORY_STORE)
41279 return disp_store;
41281 if (memory == MEMORY_LOAD)
41282 return disp_load;
41284 if (memory == MEMORY_BOTH)
41285 return disp_load_store;
41287 return disp_no_group;
41290 /* Return true if insn is a compare instruction. */
41292 static bool
41293 is_cmp (rtx insn)
41295 enum attr_type type;
41297 type = get_attr_type (insn);
41298 return (type == TYPE_TEST
41299 || type == TYPE_ICMP
41300 || type == TYPE_FCMP
41301 || GET_CODE (PATTERN (insn)) == COMPARE);
41304 /* Return true if a dispatch violation was encountered. */
41306 static bool
41307 dispatch_violation (void)
41309 if (dispatch_window_list->next)
41310 return dispatch_window_list->next->violation;
41311 return dispatch_window_list->violation;
41314 /* Return true if insn is a branch instruction. */
41316 static bool
41317 is_branch (rtx insn)
41319 return (CALL_P (insn) || JUMP_P (insn));
41322 /* Return true if insn is a prefetch instruction. */
41324 static bool
41325 is_prefetch (rtx insn)
41327 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41330 /* This function initializes a dispatch window and the list container holding a
41331 pointer to the window. */
41333 static void
41334 init_window (int window_num)
41336 int i;
41337 dispatch_windows *new_list;
41339 if (window_num == 0)
41340 new_list = dispatch_window_list;
41341 else
41342 new_list = dispatch_window_list1;
41344 new_list->num_insn = 0;
41345 new_list->num_uops = 0;
41346 new_list->window_size = 0;
41347 new_list->next = NULL;
41348 new_list->prev = NULL;
41349 new_list->window_num = window_num;
41350 new_list->num_imm = 0;
41351 new_list->num_imm_32 = 0;
41352 new_list->num_imm_64 = 0;
41353 new_list->imm_size = 0;
41354 new_list->num_loads = 0;
41355 new_list->num_stores = 0;
41356 new_list->violation = false;
41358 for (i = 0; i < MAX_INSN; i++)
41360 new_list->window[i].insn = NULL;
41361 new_list->window[i].group = disp_no_group;
41362 new_list->window[i].path = no_path;
41363 new_list->window[i].byte_len = 0;
41364 new_list->window[i].imm_bytes = 0;
41366 return;
41369 /* This function allocates and initializes a dispatch window and the
41370 list container holding a pointer to the window. */
41372 static dispatch_windows *
41373 allocate_window (void)
41375 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41376 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41378 return new_list;
41381 /* This routine initializes the dispatch scheduling information. It
41382 initiates building dispatch scheduler tables and constructs the
41383 first dispatch window. */
41385 static void
41386 init_dispatch_sched (void)
41388 /* Allocate a dispatch list and a window. */
41389 dispatch_window_list = allocate_window ();
41390 dispatch_window_list1 = allocate_window ();
41391 init_window (0);
41392 init_window (1);
41395 /* This function returns true if a branch is detected. End of a basic block
41396 does not have to be a branch, but here we assume only branches end a
41397 window. */
41399 static bool
41400 is_end_basic_block (enum dispatch_group group)
41402 return group == disp_branch;
41405 /* This function is called when the end of a window processing is reached. */
41407 static void
41408 process_end_window (void)
41410 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41411 if (dispatch_window_list->next)
41413 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41414 gcc_assert (dispatch_window_list->window_size
41415 + dispatch_window_list1->window_size <= 48);
41416 init_window (1);
41418 init_window (0);
41421 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41422 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41423 for 48 bytes of instructions. Note that these windows are not the
41424 hardware dispatch windows of size DISPATCH_WINDOW_SIZE. */
41426 static dispatch_windows *
41427 allocate_next_window (int window_num)
41429 if (window_num == 0)
41431 if (dispatch_window_list->next)
41432 init_window (1);
41433 init_window (0);
41434 return dispatch_window_list;
41437 dispatch_window_list->next = dispatch_window_list1;
41438 dispatch_window_list1->prev = dispatch_window_list;
41440 return dispatch_window_list1;
41443 /* Increment the number of immediate operands of an instruction. */
41445 static int
41446 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41448 if (*in_rtx == 0)
41449 return 0;
41451 switch (GET_CODE (*in_rtx))
41453 case CONST:
41454 case SYMBOL_REF:
41455 case CONST_INT:
41456 (imm_values->imm)++;
41457 if (x86_64_immediate_operand (*in_rtx, SImode))
41458 (imm_values->imm32)++;
41459 else
41460 (imm_values->imm64)++;
41461 break;
41463 case CONST_DOUBLE:
41464 (imm_values->imm)++;
41465 (imm_values->imm64)++;
41466 break;
41468 case CODE_LABEL:
41469 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41471 (imm_values->imm)++;
41472 (imm_values->imm32)++;
41474 break;
41476 default:
41477 break;
41480 return 0;
41483 /* Compute number of immediate operands of an instruction. */
41485 static void
41486 find_constant (rtx in_rtx, imm_info *imm_values)
41488 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41489 (rtx_function) find_constant_1, (void *) imm_values);
41492 /* Return total size of immediate operands of an instruction along with number
41493 of corresponding immediate operands. It initializes its parameters to zero
41494 before calling FIND_CONSTANT.
41495 INSN is the input instruction. IMM is the total of immediates.
41496 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41497 bit immediates. */
41499 static int
41500 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41502 imm_info imm_values = {0, 0, 0};
41504 find_constant (insn, &imm_values);
41505 *imm = imm_values.imm;
41506 *imm32 = imm_values.imm32;
41507 *imm64 = imm_values.imm64;
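  /* Total size in bytes: 4 bytes per 32-bit and 8 bytes per 64-bit
     immediate operand.  */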
41508 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
41511 /* This function indicates whether an instruction has any immediate
41512 operands. */
41514 static bool
41515 has_immediate (rtx insn)
41517 int num_imm_operand;
41518 int num_imm32_operand;
41519 int num_imm64_operand;
41521 if (insn)
41522 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41523 &num_imm64_operand);
41524 return false;
41527 /* Return single or double path for instructions. */
41529 static enum insn_path
41530 get_insn_path (rtx insn)
41532 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41534 if ((int)path == 0)
41535 return path_single;
41537 if ((int)path == 1)
41538 return path_double;
41540 return path_multi;
41543 /* Return insn dispatch group. */
41545 static enum dispatch_group
41546 get_insn_group (rtx insn)
41548 enum dispatch_group group = get_mem_group (insn);
41549 if (group)
41550 return group;
41552 if (is_branch (insn))
41553 return disp_branch;
41555 if (is_cmp (insn))
41556 return disp_cmp;
41558 if (has_immediate (insn))
41559 return disp_imm;
41561 if (is_prefetch (insn))
41562 return disp_prefetch;
41564 return disp_no_group;
41567 /* Count number of GROUP restricted instructions in a dispatch
41568 window WINDOW_LIST. */
41570 static int
41571 count_num_restricted (rtx insn, dispatch_windows *window_list)
41573 enum dispatch_group group = get_insn_group (insn);
41574 int imm_size;
41575 int num_imm_operand;
41576 int num_imm32_operand;
41577 int num_imm64_operand;
41579 if (group == disp_no_group)
41580 return 0;
41582 if (group == disp_imm)
41584 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41585 &num_imm64_operand);
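      /* Check the per-window immediate limits.  Note that in the checks
	 below each 64-bit immediate also counts as two 32-bit immediates
	 against MAX_IMM_32.  */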
41586 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41587 || num_imm_operand + window_list->num_imm > MAX_IMM
41588 || (num_imm32_operand > 0
41589 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41590 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41591 || (num_imm64_operand > 0
41592 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41593 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41594 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41595 && num_imm64_operand > 0
41596 && ((window_list->num_imm_64 > 0
41597 && window_list->num_insn >= 2)
41598 || window_list->num_insn >= 3)))
41599 return BIG;
41601 return 1;
41604 if ((group == disp_load_store
41605 && (window_list->num_loads >= MAX_LOAD
41606 || window_list->num_stores >= MAX_STORE))
41607 || ((group == disp_load
41608 || group == disp_prefetch)
41609 && window_list->num_loads >= MAX_LOAD)
41610 || (group == disp_store
41611 && window_list->num_stores >= MAX_STORE))
41612 return BIG;
41614 return 1;
41617 /* This function returns true if insn satisfies dispatch rules on the
41618 last window scheduled. */
41620 static bool
41621 fits_dispatch_window (rtx insn)
41623 dispatch_windows *window_list = dispatch_window_list;
41624 dispatch_windows *window_list_next = dispatch_window_list->next;
41625 unsigned int num_restrict;
41626 enum dispatch_group group = get_insn_group (insn);
41627 enum insn_path path = get_insn_path (insn);
41628 int sum;
41630 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41631 instructions should be given the lowest priority in the
41632 scheduling process in the Haifa scheduler to make sure they will be
41633 scheduled in the same dispatch window as the reference to them. */
41634 if (group == disp_jcc || group == disp_cmp)
41635 return false;
41637 /* Check nonrestricted. */
41638 if (group == disp_no_group || group == disp_branch)
41639 return true;
41641 /* Get last dispatch window. */
41642 if (window_list_next)
41643 window_list = window_list_next;
41645 if (window_list->window_num == 1)
41647 sum = window_list->prev->window_size + window_list->window_size;
41649 if (sum == 32
41650 || (min_insn_size (insn) + sum) >= 48)
41651 /* Window 1 is full. Go for next window. */
41652 return true;
41655 num_restrict = count_num_restricted (insn, window_list);
41657 if (num_restrict > num_allowable_groups[group])
41658 return false;
41660 /* See if it fits in the first window. */
41661 if (window_list->window_num == 0)
41663 /* The first window should have only single and double path
41664 uops. */
41665 if (path == path_double
41666 && (window_list->num_uops + 2) > MAX_INSN)
41667 return false;
41668 else if (path != path_single)
41669 return false;
41671 return true;
41674 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41675 dispatch window WINDOW_LIST. */
41677 static void
41678 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41680 int byte_len = min_insn_size (insn);
41681 int num_insn = window_list->num_insn;
41682 int imm_size;
41683 sched_insn_info *window = window_list->window;
41684 enum dispatch_group group = get_insn_group (insn);
41685 enum insn_path path = get_insn_path (insn);
41686 int num_imm_operand;
41687 int num_imm32_operand;
41688 int num_imm64_operand;
41690 if (!window_list->violation && group != disp_cmp
41691 && !fits_dispatch_window (insn))
41692 window_list->violation = true;
41694 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41695 &num_imm64_operand);
41697 /* Initialize window with new instruction. */
41698 window[num_insn].insn = insn;
41699 window[num_insn].byte_len = byte_len;
41700 window[num_insn].group = group;
41701 window[num_insn].path = path;
41702 window[num_insn].imm_bytes = imm_size;
41704 window_list->window_size += byte_len;
41705 window_list->num_insn = num_insn + 1;
41706 window_list->num_uops = window_list->num_uops + num_uops;
41707 window_list->imm_size += imm_size;
41708 window_list->num_imm += num_imm_operand;
41709 window_list->num_imm_32 += num_imm32_operand;
41710 window_list->num_imm_64 += num_imm64_operand;
41712 if (group == disp_store)
41713 window_list->num_stores += 1;
41714 else if (group == disp_load
41715 || group == disp_prefetch)
41716 window_list->num_loads += 1;
41717 else if (group == disp_load_store)
41719 window_list->num_stores += 1;
41720 window_list->num_loads += 1;
41724 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41725 If the total bytes of instructions or the number of instructions in
41726 the window exceeds the allowable limit, it allocates a new window. */
41728 static void
41729 add_to_dispatch_window (rtx insn)
41731 int byte_len;
41732 dispatch_windows *window_list;
41733 dispatch_windows *next_list;
41734 dispatch_windows *window0_list;
41735 enum insn_path path;
41736 enum dispatch_group insn_group;
41737 bool insn_fits;
41738 int num_insn;
41739 int num_uops;
41740 int window_num;
41741 int insn_num_uops;
41742 int sum;
41744 if (INSN_CODE (insn) < 0)
41745 return;
41747 byte_len = min_insn_size (insn);
41748 window_list = dispatch_window_list;
41749 next_list = window_list->next;
41750 path = get_insn_path (insn);
41751 insn_group = get_insn_group (insn);
41753 /* Get the last dispatch window. */
41754 if (next_list)
41755 window_list = dispatch_window_list->next;
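  /* Map the decode path to a uop count: single-path insns count as one
     uop, double-path insns as two, and path_multi falls back to its enum
     value (3).  */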
41757 if (path == path_single)
41758 insn_num_uops = 1;
41759 else if (path == path_double)
41760 insn_num_uops = 2;
41761 else
41762 insn_num_uops = (int) path;
41764 /* If the current window is full, get a new window.
41765 Window number zero is full if MAX_INSN uops are scheduled in it.
41766 Window number one is full if window zero's bytes plus window
41767 one's bytes equal 32, or if adding the bytes of the new instruction
41768 makes the total greater than 48, or if it already has MAX_INSN
41769 instructions in it. */
41770 num_insn = window_list->num_insn;
41771 num_uops = window_list->num_uops;
41772 window_num = window_list->window_num;
41773 insn_fits = fits_dispatch_window (insn);
41775 if (num_insn >= MAX_INSN
41776 || num_uops + insn_num_uops > MAX_INSN
41777 || !(insn_fits))
41779 window_num = ~window_num & 1;
41780 window_list = allocate_next_window (window_num);
41783 if (window_num == 0)
41785 add_insn_window (insn, window_list, insn_num_uops);
41786 if (window_list->num_insn >= MAX_INSN
41787 && insn_group == disp_branch)
41789 process_end_window ();
41790 return;
41793 else if (window_num == 1)
41795 window0_list = window_list->prev;
41796 sum = window0_list->window_size + window_list->window_size;
41797 if (sum == 32
41798 || (byte_len + sum) >= 48)
41800 process_end_window ();
41801 window_list = dispatch_window_list;
41804 add_insn_window (insn, window_list, insn_num_uops);
41806 else
41807 gcc_unreachable ();
41809 if (is_end_basic_block (insn_group))
41811 /* End of basic block is reached; do end-basic-block processing. */
41812 process_end_window ();
41813 return;
41817 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41819 DEBUG_FUNCTION static void
41820 debug_dispatch_window_file (FILE *file, int window_num)
41822 dispatch_windows *list;
41823 int i;
41825 if (window_num == 0)
41826 list = dispatch_window_list;
41827 else
41828 list = dispatch_window_list1;
41830 fprintf (file, "Window #%d:\n", list->window_num);
41831 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41832 list->num_insn, list->num_uops, list->window_size);
41833 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41834 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41836 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41837 list->num_stores);
41838 fprintf (file, " insn info:\n");
41840 for (i = 0; i < MAX_INSN; i++)
41842 if (!list->window[i].insn)
41843 break;
41844 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41845 i, group_name[list->window[i].group],
41846 i, (void *)list->window[i].insn,
41847 i, list->window[i].path,
41848 i, list->window[i].byte_len,
41849 i, list->window[i].imm_bytes);
41853 /* Print to stdout a dispatch window. */
41855 DEBUG_FUNCTION void
41856 debug_dispatch_window (int window_num)
41858 debug_dispatch_window_file (stdout, window_num);
41861 /* Print INSN dispatch information to FILE. */
41863 DEBUG_FUNCTION static void
41864 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41866 int byte_len;
41867 enum insn_path path;
41868 enum dispatch_group group;
41869 int imm_size;
41870 int num_imm_operand;
41871 int num_imm32_operand;
41872 int num_imm64_operand;
41874 if (INSN_CODE (insn) < 0)
41875 return;
41877 byte_len = min_insn_size (insn);
41878 path = get_insn_path (insn);
41879 group = get_insn_group (insn);
41880 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41881 &num_imm64_operand);
41883 fprintf (file, " insn info:\n");
41884 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41885 group_name[group], path, byte_len);
41886 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41887 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41890 /* Print to stdout the status of the ready list with respect to
41891 dispatch windows. */
41893 DEBUG_FUNCTION void
41894 debug_ready_dispatch (void)
41896 int i;
41897 int no_ready = number_in_ready ();
41899 fprintf (stdout, "Number of ready: %d\n", no_ready);
41901 for (i = 0; i < no_ready; i++)
41902 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41905 /* This routine is the driver of the dispatch scheduler. */
41907 static void
41908 do_dispatch (rtx insn, int mode)
41910 if (mode == DISPATCH_INIT)
41911 init_dispatch_sched ();
41912 else if (mode == ADD_TO_DISPATCH_WINDOW)
41913 add_to_dispatch_window (insn);
41916 /* Return TRUE if Dispatch Scheduling is supported. */
41918 static bool
41919 has_dispatch (rtx insn, int action)
41921 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
41922 && flag_dispatch_scheduler)
41923 switch (action)
41925 default:
41926 return false;
41928 case IS_DISPATCH_ON:
41929 return true;
41930 break;
41932 case IS_CMP:
41933 return is_cmp (insn);
41935 case DISPATCH_VIOLATION:
41936 return dispatch_violation ();
41938 case FITS_DISPATCH_WINDOW:
41939 return fits_dispatch_window (insn);
41942 return false;
41945 /* Implementation of reassociation_width target hook used by
41946 the reassoc phase to identify the parallelism level in a reassociated
41947 tree. The statement's tree_code is passed in OPC. The arguments'
41948 type is passed in MODE.
41950 Currently parallel reassociation is enabled for Atom
41951 processors only, and we set the reassociation width to 2
41952 because Atom may issue up to 2 instructions per cycle.
41954 Return value should be fixed if parallel reassociation is
41955 enabled for other processors. */
41957 static int
41958 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
41959 enum machine_mode mode)
41961 int res = 1;
41963 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
41964 res = 2;
41965 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
41966 res = 2;
41968 return res;
41971 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
41972 place emms and femms instructions. */
41974 static enum machine_mode
41975 ix86_preferred_simd_mode (enum machine_mode mode)
41977 if (!TARGET_SSE)
41978 return word_mode;
41980 switch (mode)
41982 case QImode:
41983 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
41984 case HImode:
41985 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
41986 case SImode:
41987 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
41988 case DImode:
41989 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
41991 case SFmode:
41992 if (TARGET_AVX && !TARGET_PREFER_AVX128)
41993 return V8SFmode;
41994 else
41995 return V4SFmode;
41997 case DFmode:
41998 if (!TARGET_VECTORIZE_DOUBLE)
41999 return word_mode;
42000 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42001 return V4DFmode;
42002 else if (TARGET_SSE2)
42003 return V2DFmode;
42004 /* FALLTHRU */
42006 default:
42007 return word_mode;
42011 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
42012 vectors. */
42014 static unsigned int
42015 ix86_autovectorize_vector_sizes (void)
42017 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42022 /* Return the class of registers which could be used for a pseudo of MODE
42023 and of class RCLASS for spilling instead of memory. Return NO_REGS
42024 if it is not possible or not profitable. */
42025 static reg_class_t
42026 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42028 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42029 && hard_reg_set_subset_p (reg_class_contents[rclass],
42030 reg_class_contents[GENERAL_REGS])
42031 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
42032 return SSE_REGS;
42033 return NO_REGS;
42036 /* Implement targetm.vectorize.init_cost. */
42038 static void *
42039 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42041 unsigned *cost = XNEWVEC (unsigned, 3);
42042 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42043 return cost;
42046 /* Implement targetm.vectorize.add_stmt_cost. */
42048 static unsigned
42049 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42050 struct _stmt_vec_info *stmt_info, int misalign,
42051 enum vect_cost_model_location where)
42053 unsigned *cost = (unsigned *) data;
42054 unsigned retval = 0;
42056 if (flag_vect_cost_model)
42058 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42059 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42061 /* Statements in an inner loop relative to the loop being
42062 vectorized are weighted more heavily. The value here is
42063 arbitrary and could potentially be improved with analysis. */
42064 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42065 count *= 50; /* FIXME. */
42067 retval = (unsigned) (count * stmt_cost);
42068 cost[where] += retval;
42071 return retval;
42074 /* Implement targetm.vectorize.finish_cost. */
42076 static void
42077 ix86_finish_cost (void *data, unsigned *prologue_cost,
42078 unsigned *body_cost, unsigned *epilogue_cost)
42080 unsigned *cost = (unsigned *) data;
42081 *prologue_cost = cost[vect_prologue];
42082 *body_cost = cost[vect_body];
42083 *epilogue_cost = cost[vect_epilogue];
42086 /* Implement targetm.vectorize.destroy_cost_data. */
42088 static void
42089 ix86_destroy_cost_data (void *data)
42091 free (data);
42094 /* Validate target-specific memory model bits in VAL. */
42096 static unsigned HOST_WIDE_INT
42097 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42099 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42100 bool strong;
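  /* VAL combines a standard memory model (in MEMMODEL_MASK) with the
     target-specific HLE bits; reject any other bit, or both HLE bits
     set at once.  */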
42102 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42103 |MEMMODEL_MASK)
42104 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42106 warning (OPT_Winvalid_memory_model,
42107 "Unknown architecture specific memory model");
42108 return MEMMODEL_SEQ_CST;
42110 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42111 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42113 warning (OPT_Winvalid_memory_model,
42114 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42115 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42117 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42119 warning (OPT_Winvalid_memory_model,
42120 "HLE_RELEASE not used with RELEASE or stronger memory model");
42121 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42123 return val;
42126 /* Initialize the GCC target structure. */
42127 #undef TARGET_RETURN_IN_MEMORY
42128 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42130 #undef TARGET_LEGITIMIZE_ADDRESS
42131 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42133 #undef TARGET_ATTRIBUTE_TABLE
42134 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42135 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42136 # undef TARGET_MERGE_DECL_ATTRIBUTES
42137 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42138 #endif
42140 #undef TARGET_COMP_TYPE_ATTRIBUTES
42141 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42143 #undef TARGET_INIT_BUILTINS
42144 #define TARGET_INIT_BUILTINS ix86_init_builtins
42145 #undef TARGET_BUILTIN_DECL
42146 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42147 #undef TARGET_EXPAND_BUILTIN
42148 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42150 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42151 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42152 ix86_builtin_vectorized_function
42154 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42155 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42157 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42158 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42160 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42161 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42163 #undef TARGET_BUILTIN_RECIPROCAL
42164 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42166 #undef TARGET_ASM_FUNCTION_EPILOGUE
42167 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42169 #undef TARGET_ENCODE_SECTION_INFO
42170 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42171 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42172 #else
42173 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42174 #endif
42176 #undef TARGET_ASM_OPEN_PAREN
42177 #define TARGET_ASM_OPEN_PAREN ""
42178 #undef TARGET_ASM_CLOSE_PAREN
42179 #define TARGET_ASM_CLOSE_PAREN ""
42181 #undef TARGET_ASM_BYTE_OP
42182 #define TARGET_ASM_BYTE_OP ASM_BYTE
42184 #undef TARGET_ASM_ALIGNED_HI_OP
42185 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42186 #undef TARGET_ASM_ALIGNED_SI_OP
42187 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42188 #ifdef ASM_QUAD
42189 #undef TARGET_ASM_ALIGNED_DI_OP
42190 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42191 #endif
42193 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42194 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42196 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42197 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42199 #undef TARGET_ASM_UNALIGNED_HI_OP
42200 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42201 #undef TARGET_ASM_UNALIGNED_SI_OP
42202 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42203 #undef TARGET_ASM_UNALIGNED_DI_OP
42204 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42206 #undef TARGET_PRINT_OPERAND
42207 #define TARGET_PRINT_OPERAND ix86_print_operand
42208 #undef TARGET_PRINT_OPERAND_ADDRESS
42209 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42210 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42211 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42212 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42213 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42215 #undef TARGET_SCHED_INIT_GLOBAL
42216 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42217 #undef TARGET_SCHED_ADJUST_COST
42218 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42219 #undef TARGET_SCHED_ISSUE_RATE
42220 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42221 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42222 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42223 ia32_multipass_dfa_lookahead
42225 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42226 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42228 #undef TARGET_MEMMODEL_CHECK
42229 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42231 #ifdef HAVE_AS_TLS
42232 #undef TARGET_HAVE_TLS
42233 #define TARGET_HAVE_TLS true
42234 #endif
42235 #undef TARGET_CANNOT_FORCE_CONST_MEM
42236 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42237 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42238 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42240 #undef TARGET_DELEGITIMIZE_ADDRESS
42241 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42243 #undef TARGET_MS_BITFIELD_LAYOUT_P
42244 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42246 #if TARGET_MACHO
42247 #undef TARGET_BINDS_LOCAL_P
42248 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42249 #endif
42250 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42251 #undef TARGET_BINDS_LOCAL_P
42252 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42253 #endif
42255 #undef TARGET_ASM_OUTPUT_MI_THUNK
42256 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42257 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42258 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42260 #undef TARGET_ASM_FILE_START
42261 #define TARGET_ASM_FILE_START x86_file_start
42263 #undef TARGET_OPTION_OVERRIDE
42264 #define TARGET_OPTION_OVERRIDE ix86_option_override
42266 #undef TARGET_REGISTER_MOVE_COST
42267 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42268 #undef TARGET_MEMORY_MOVE_COST
42269 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42270 #undef TARGET_RTX_COSTS
42271 #define TARGET_RTX_COSTS ix86_rtx_costs
42272 #undef TARGET_ADDRESS_COST
42273 #define TARGET_ADDRESS_COST ix86_address_cost
42275 #undef TARGET_FIXED_CONDITION_CODE_REGS
42276 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42277 #undef TARGET_CC_MODES_COMPATIBLE
42278 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42280 #undef TARGET_MACHINE_DEPENDENT_REORG
42281 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42283 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42284 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42286 #undef TARGET_BUILD_BUILTIN_VA_LIST
42287 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42289 #undef TARGET_FOLD_BUILTIN
42290 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42292 #undef TARGET_COMPARE_VERSION_PRIORITY
42293 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42295 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42296 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42297 ix86_generate_version_dispatcher_body
42299 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42300 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42301 ix86_get_function_versions_dispatcher
42303 #undef TARGET_ENUM_VA_LIST_P
42304 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42306 #undef TARGET_FN_ABI_VA_LIST
42307 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42309 #undef TARGET_CANONICAL_VA_LIST_TYPE
42310 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42312 #undef TARGET_EXPAND_BUILTIN_VA_START
42313 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42315 #undef TARGET_MD_ASM_CLOBBERS
42316 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42318 #undef TARGET_PROMOTE_PROTOTYPES
42319 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42320 #undef TARGET_STRUCT_VALUE_RTX
42321 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42322 #undef TARGET_SETUP_INCOMING_VARARGS
42323 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42324 #undef TARGET_MUST_PASS_IN_STACK
42325 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42326 #undef TARGET_FUNCTION_ARG_ADVANCE
42327 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42328 #undef TARGET_FUNCTION_ARG
42329 #define TARGET_FUNCTION_ARG ix86_function_arg
42330 #undef TARGET_FUNCTION_ARG_BOUNDARY
42331 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42332 #undef TARGET_PASS_BY_REFERENCE
42333 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42334 #undef TARGET_INTERNAL_ARG_POINTER
42335 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42336 #undef TARGET_UPDATE_STACK_BOUNDARY
42337 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42338 #undef TARGET_GET_DRAP_RTX
42339 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42340 #undef TARGET_STRICT_ARGUMENT_NAMING
42341 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42342 #undef TARGET_STATIC_CHAIN
42343 #define TARGET_STATIC_CHAIN ix86_static_chain
42344 #undef TARGET_TRAMPOLINE_INIT
42345 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42346 #undef TARGET_RETURN_POPS_ARGS
42347 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42349 #undef TARGET_LEGITIMATE_COMBINED_INSN
42350 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42352 #undef TARGET_ASAN_SHADOW_OFFSET
42353 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42355 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42356 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42358 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42359 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42361 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42362 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42364 #undef TARGET_C_MODE_FOR_SUFFIX
42365 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42367 #ifdef HAVE_AS_TLS
42368 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42369 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42370 #endif
42372 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42373 #undef TARGET_INSERT_ATTRIBUTES
42374 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42375 #endif
42377 #undef TARGET_MANGLE_TYPE
42378 #define TARGET_MANGLE_TYPE ix86_mangle_type
42380 #if !TARGET_MACHO
42381 #undef TARGET_STACK_PROTECT_FAIL
42382 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42383 #endif
42385 #undef TARGET_FUNCTION_VALUE
42386 #define TARGET_FUNCTION_VALUE ix86_function_value
42388 #undef TARGET_FUNCTION_VALUE_REGNO_P
42389 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42391 #undef TARGET_PROMOTE_FUNCTION_MODE
42392 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42394 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42395 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42397 #undef TARGET_INSTANTIATE_DECLS
42398 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42400 #undef TARGET_SECONDARY_RELOAD
42401 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42403 #undef TARGET_CLASS_MAX_NREGS
42404 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42406 #undef TARGET_PREFERRED_RELOAD_CLASS
42407 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42408 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42409 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42410 #undef TARGET_CLASS_LIKELY_SPILLED_P
42411 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42413 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42414 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42415 ix86_builtin_vectorization_cost
42416 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42417 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42418 ix86_vectorize_vec_perm_const_ok
42419 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42420 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42421 ix86_preferred_simd_mode
42422 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42423 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42424 ix86_autovectorize_vector_sizes
42425 #undef TARGET_VECTORIZE_INIT_COST
42426 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42427 #undef TARGET_VECTORIZE_ADD_STMT_COST
42428 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42429 #undef TARGET_VECTORIZE_FINISH_COST
42430 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42431 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42432 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42434 #undef TARGET_SET_CURRENT_FUNCTION
42435 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42437 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42438 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42440 #undef TARGET_OPTION_SAVE
42441 #define TARGET_OPTION_SAVE ix86_function_specific_save
42443 #undef TARGET_OPTION_RESTORE
42444 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42446 #undef TARGET_OPTION_PRINT
42447 #define TARGET_OPTION_PRINT ix86_function_specific_print
42449 #undef TARGET_OPTION_FUNCTION_VERSIONS
42450 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42452 #undef TARGET_OPTION_SUPPORTS_FUNCTION_VERSIONS
42453 #define TARGET_OPTION_SUPPORTS_FUNCTION_VERSIONS \
42454 ix86_supports_function_versions
42456 #undef TARGET_CAN_INLINE_P
42457 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42459 #undef TARGET_EXPAND_TO_RTL_HOOK
42460 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42462 #undef TARGET_LEGITIMATE_ADDRESS_P
42463 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42465 #undef TARGET_LRA_P
42466 #define TARGET_LRA_P hook_bool_void_true
42468 #undef TARGET_REGISTER_PRIORITY
42469 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42471 #undef TARGET_LEGITIMATE_CONSTANT_P
42472 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42474 #undef TARGET_FRAME_POINTER_REQUIRED
42475 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42477 #undef TARGET_CAN_ELIMINATE
42478 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42480 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42481 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42483 #undef TARGET_ASM_CODE_END
42484 #define TARGET_ASM_CODE_END ix86_code_end
42486 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42487 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42489 #if TARGET_MACHO
42490 #undef TARGET_INIT_LIBFUNCS
42491 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42492 #endif
42494 #undef TARGET_SPILL_CLASS
42495 #define TARGET_SPILL_CLASS ix86_spill_class
42497 struct gcc_target targetm = TARGET_INITIALIZER;
42499 #include "gt-i386.h"