1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
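/* Illustrative sketch (editorial addition, not part of the original file):
   MODE_INDEX picks the per-mode slot out of the 5-entry mult and divide
   cost arrays in struct processor_costs, so a multiply-cost lookup in the
   RTX cost hook looks roughly like the helper below.  The field names
   mult_init and mult_bit are assumed from the processor_costs definition
   in i386.h; the helper itself is hypothetical.  */
#if 0
static int
example_mult_cost (const struct processor_costs *cost,
		   enum machine_mode mode, int nbits)
{
  /* Indices 0..3 correspond to QI/HI/SI/DImode; index 4 is "other".  */
  return cost->mult_init[MODE_INDEX (mode)] + nbits * cost->mult_bit;
}
#endif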
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
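/* Worked example (editorial addition): under the assumption above that
   COSTS_N_INSNS (N) expands to (N) * 4 and an add is 2 bytes,
   COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so a 2-byte add in the
   size-tuned table below lands on the same scale as one "average"
   instruction in the speed-tuned tables that follow.  */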
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
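/* Editorial note (a sketch of the initializer shape, assuming the
   stringop_algs layout of this era of i386.h): each stringop entry in the
   cost tables reads as
     { alg_for_unknown_size, {{max_size, alg, noalign}, ...} }
   with max_size == -1 terminating the list, so DUMMY_STRINGOP_ALGS simply
   falls back to a libcall for every block size.  The "false" third field
   is taken here to be the noalign flag; treat that as an assumption.  */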
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
153 1, /* cond_not_taken_branch_cost. */
154 };
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
224 1, /* cond_not_taken_branch_cost. */
225 };
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. 486 has 8kB cache
270 shared for code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
296 1, /* cond_not_taken_branch_cost. */
297 };
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
366 1, /* cond_not_taken_branch_cost. */
367 };
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
422 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
423 (we ensure the alignment). For small blocks an inline loop is still a
424 noticeable win; for bigger blocks either rep movsl or rep movsb is the
425 way to go. Rep movsb apparently has a more expensive startup time in the
426 CPU, but after 4K the difference is down in the noise. */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with REP prefix (relative to loops)
646 compared to K8. Alignment becomes important after 8 bytes for memcpy and
647 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set number of simultaneous prefetches
712 to a large constant to reflect this (it probably is not a good idea not
713 to limit number of prefetches at all, as their execution also takes some
714 time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
724 small blocks it is better to use a loop. For large blocks, a libcall can
725 do nontemporal accesses and beat inline copying considerably. */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
734 4, /* scalar_stmt_cost. */
735 2, /* scalar load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
744 2, /* cond_not_taken_branch_cost. */
745 };
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
799 /* New AMD processors never drop prefetches; if they cannot be performed
800 immediately, they are queued. We set number of simultaneous prefetches
801 to a large constant to reflect this (it probably is not a good idea not
802 to limit number of prefetches at all, as their execution also takes some
803 time). */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
813 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
814 very small blocks it is better to use a loop. For large blocks, a libcall can
815 do nontemporal accesses and beat inline copying considerably. */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
834 1, /* cond_not_taken_branch_cost. */
835 };
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
889 /* New AMD processors never drop prefetches; if they cannot be performed
890 immediately, they are queued. We set number of simultaneous prefetches
891 to a large constant to reflect this (it probably is not a good idea not
892 to limit number of prefetches at all, as their execution also takes some
893 time). */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
903 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
904 very small blocks it is better to use a loop. For large blocks, a libcall
905 can do nontemporal accesses and beat inline copying considerably. */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
924 1, /* cond_not_taken_branch_cost. */
925 };
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
979 /* New AMD processors never drop prefetches; if they cannot be performed
980 immediately, they are queued. We set number of simultaneous prefetches
981 to a large constant to reflect this (it probably is not a good idea not
982 to limit number of prefetches at all, as their execution also takes some
983 time). */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
993 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
994 very small blocks it is better to use a loop. For large blocks, a libcall
995 can do nontemporal accesses and beat inline copying considerably. */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 /* New AMD processors never drop prefetches; if they cannot be performed
1062 immediately, they are queued. We set number of simultaneous prefetches
1063 to a large constant to reflect this (it probably is not a good idea not
1064 to limit number of prefetches at all, as their execution also takes some
1065 time). */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1075 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1076 very small blocks it is better to use a loop. For large blocks, a libcall
1077 can do nontemporal accesses and beat inline copying considerably. */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
1096 1, /* cond_not_taken_branch_cost. */
1097 };
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1160 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1161 very small blocks it is better to use a loop. For large blocks, a libcall can
1162 do nontemporal accesses and beat inline copying considerably. */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
1181 1, /* cond_not_taken_branch_cost. */
1182 };
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
1263 1, /* cond_not_taken_branch_cost. */
1264 };
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 128, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471 1, /* scalar load_cost. */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487 /* On all chips taken into consideration, lea is 2 cycles or more. With
1488 this cost, however, our current implementation of synth_mult results in
1489 the use of unnecessary temporary registers, causing regressions on several
1490 SPECfp benchmarks. */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535 value is increased to the perhaps more appropriate value of 5. */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550 1, /* scalar load_cost. */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1562 /* core_cost should produce code tuned for the Core family of CPUs. */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566 /* On all chips taken into consideration, lea is 2 cycles or more. With
1567 this cost, however, our current implementation of synth_mult results in
1568 the use of unnecessary temporary registers, causing regressions on several
1569 SPECfp benchmarks. */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613 /* FIXME: perhaps a more appropriate value is 5. */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631 1, /* scalar load_cost. */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705 1, /* scalar load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
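/* Illustrative sketch (not part of the build): cost queries elsewhere in
   this file read fields of the table selected above, for example when
   comparing the relative cost of an add and a lea.  The helper below is
   hypothetical and only shows the access pattern.  */
#if 0
static int
example_lea_no_worse_than_add (void)
{
  /* ix86_cost points at the table chosen by -mtune or -Os.  */
  return ix86_cost->lea <= ix86_cost->add;
}
#endif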
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1734 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1735 #define m_ATOM (1<<PROCESSOR_ATOM)
1737 #define m_GEODE (1<<PROCESSOR_GEODE)
1738 #define m_K6 (1<<PROCESSOR_K6)
1739 #define m_K6_GEODE (m_K6 | m_GEODE)
1740 #define m_K8 (1<<PROCESSOR_K8)
1741 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1742 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1743 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1744 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1745 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1746 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1747 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1748 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1749 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1750 #define m_BTVER (m_BTVER1 | m_BTVER2)
1751 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1753 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1754 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1756 /* Generic instruction choice should be the common subset of supported CPUs
1757 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1758 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1760 /* Feature tests against the various tunings. */
1761 unsigned char ix86_tune_features[X86_TUNE_LAST];
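/* Illustrative sketch (not part of the build): conceptually, each entry of
   ix86_tune_features is derived from the initial_ix86_tune_features table
   below by testing the bit of the CPU selected with -mtune, roughly as
   follows (the real derivation is done later, during option override).  */
#if 0
{
  unsigned int tune_mask = 1 << ix86_tune;
  int i;
  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & tune_mask);
}
#endif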
1763 /* Feature tests against the various tunings used to create ix86_tune_features
1764 based on the processor mask. */
1765 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1766 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1767 negatively, so enabling it for Generic64 seems like a good code-size
1768 tradeoff. We can't enable it for 32bit generic because it does not
1769 work well with PPro based chips. */
1770 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1772 /* X86_TUNE_PUSH_MEMORY */
1773 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1775 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1776 m_486 | m_PENT,
1778 /* X86_TUNE_UNROLL_STRLEN */
1779 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1781 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1782 on simulation results. But after P4 was made, no performance benefit
1783 was observed with branch hints. They also increase the code size.
1784 As a result, icc never generates branch hints. */
1787 /* X86_TUNE_DOUBLE_WITH_ADD */
1788 ~m_386,
1790 /* X86_TUNE_USE_SAHF */
1791 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1793 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1794 partial dependencies. */
1795 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1797 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1798 register stalls on the Generic32 compilation setting as well. However,
1799 in the current implementation the partial register stalls are not eliminated
1800 very well - they can be introduced via subregs synthesized by combine
1801 and can happen in caller/callee saving sequences. Because this option
1802 pays back little on PPro based chips and is in conflict with the partial reg
1803 dependencies used by Athlon/P4 based chips, it is better to leave it off
1804 for generic32 for now. */
1805 m_PPRO,
1807 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1808 m_CORE_ALL | m_GENERIC,
1810 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1811 on 16-bit immediate moves into memory on Core2 and Corei7. */
1812 m_CORE_ALL | m_GENERIC,
1814 /* X86_TUNE_USE_HIMODE_FIOP */
1815 m_386 | m_486 | m_K6_GEODE,
1817 /* X86_TUNE_USE_SIMODE_FIOP */
1818 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1820 /* X86_TUNE_USE_MOV0 */
1821 m_K6,
1823 /* X86_TUNE_USE_CLTD */
1824 ~(m_PENT | m_ATOM | m_K6),
1826 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1827 m_PENT4,
1829 /* X86_TUNE_SPLIT_LONG_MOVES */
1830 m_PPRO,
1832 /* X86_TUNE_READ_MODIFY_WRITE */
1833 ~m_PENT,
1835 /* X86_TUNE_READ_MODIFY */
1836 ~(m_PENT | m_PPRO),
1838 /* X86_TUNE_PROMOTE_QIMODE */
1839 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1841 /* X86_TUNE_FAST_PREFIX */
1842 ~(m_386 | m_486 | m_PENT),
1844 /* X86_TUNE_SINGLE_STRINGOP */
1845 m_386 | m_P4_NOCONA,
1847 /* X86_TUNE_QIMODE_MATH */
1850 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1851 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1852 might be considered for Generic32 if our scheme for avoiding partial
1853 stalls was more effective. */
1854 ~m_PPRO,
1856 /* X86_TUNE_PROMOTE_QI_REGS */
1859 /* X86_TUNE_PROMOTE_HI_REGS */
1860 m_PPRO,
1862 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1863 over esp addition. */
1864 m_386 | m_486 | m_PENT | m_PPRO,
1866 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1867 over esp addition. */
1868 m_PENT,
1870 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1871 over esp subtraction. */
1872 m_386 | m_486 | m_PENT | m_K6_GEODE,
1874 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
1875 over esp subtraction. */
1876 m_PENT | m_K6_GEODE,
1878 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1879 for DFmode copies */
1880 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1882 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1883 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1885 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1886 conflict here between PPro/Pentium4 based chips that treat 128bit
1887 SSE registers as single units and K8 based chips that divide SSE
1888 registers into two 64bit halves. This knob promotes all store destinations
1889 to 128bit to allow register renaming on 128bit SSE units, but usually
1890 results in one extra microop on 64bit SSE units. Experimental results
1891 show that disabling this option on P4 brings over 20% SPECfp regression,
1892 while enabling it on K8 brings roughly 2.4% regression that can be partly
1893 masked by careful scheduling of moves. */
1894 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1896 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1897 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1899 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1900 m_COREI7 | m_BDVER,
1902 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1903 m_BDVER,
1905 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1906 are resolved on SSE register parts instead of whole registers, so we may
1907 maintain just lower part of scalar values in proper format leaving the
1908 upper part undefined. */
1909 m_ATHLON_K8,
1911 /* X86_TUNE_SSE_TYPELESS_STORES */
1912 m_AMD_MULTIPLE,
1914 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1915 m_PPRO | m_P4_NOCONA,
1917 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1918 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1920 /* X86_TUNE_PROLOGUE_USING_MOVE */
1921 m_PPRO | m_ATHLON_K8,
1923 /* X86_TUNE_EPILOGUE_USING_MOVE */
1924 m_PPRO | m_ATHLON_K8,
1926 /* X86_TUNE_SHIFT1 */
1927 ~m_486,
1929 /* X86_TUNE_USE_FFREEP */
1930 m_AMD_MULTIPLE,
1932 /* X86_TUNE_INTER_UNIT_MOVES */
1933 ~(m_AMD_MULTIPLE | m_GENERIC),
1935 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1936 ~(m_AMDFAM10 | m_BDVER),
1938 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1939 than 4 branch instructions in the 16 byte window. */
1940 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1942 /* X86_TUNE_SCHEDULE */
1943 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1945 /* X86_TUNE_USE_BT */
1946 m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1948 /* X86_TUNE_USE_INCDEC */
1949 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
1951 /* X86_TUNE_PAD_RETURNS */
1952 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
1954 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
1955 m_ATOM,
1957 /* X86_TUNE_EXT_80387_CONSTANTS */
1958 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1960 /* X86_TUNE_AVOID_VECTOR_DECODE */
1961 m_CORE_ALL | m_K8 | m_GENERIC64,
1963 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1964 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
1965 ~(m_386 | m_486),
1967 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is a
1968 vector-path instruction on AMD machines. */
1969 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1971 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector-path
1972 instruction on AMD machines. */
1973 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1975 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1976 than a MOV. */
1977 m_PENT,
1979 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1980 but one byte longer. */
1981 m_PENT,
1983 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1984 operand that cannot be represented using a modRM byte. The XOR
1985 replacement is long decoded, so this split helps here as well. */
1986 m_K6,
1988 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1989 from FP to FP. */
1990 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
1992 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1993 from integer to FP. */
1994 m_AMDFAM10,
1996 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1997 with a subsequent conditional jump instruction into a single
1998 compare-and-branch uop. */
1999 m_BDVER,
2001 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2002 will impact LEA instruction selection. */
2003 m_ATOM,
2005 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2006 instructions. */
2007 ~m_ATOM,
2009 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2010 at -O3. For the moment, the prefetching seems badly tuned for Intel
2011 chips. */
2012 m_K6_GEODE | m_AMD_MULTIPLE,
2014 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2015 the auto-vectorizer. */
2016 m_BDVER | m_BTVER2,
2018 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2019 during reassociation of integer computation. */
2020 m_ATOM,
2022 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2023 during reassociation of fp computation. */
2024 m_ATOM,
2026 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2027 regs instead of memory. */
2028 m_CORE_ALL,
2030 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2031 a conditional move. */
2032 m_ATOM
2035 /* Feature tests against the various architecture variations. */
2036 unsigned char ix86_arch_features[X86_ARCH_LAST];
2038 /* Feature tests against the various architecture variations, used to create
2039 ix86_arch_features based on the processor mask. */
2040 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2041 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2042 ~(m_386 | m_486 | m_PENT | m_K6),
2044 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2045 ~m_386,
2047 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2048 ~(m_386 | m_486),
2050 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2051 ~m_386,
2053 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2054 ~m_386,
2057 static const unsigned int x86_accumulate_outgoing_args
2058 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2060 static const unsigned int x86_arch_always_fancy_math_387
2061 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2063 static const unsigned int x86_avx256_split_unaligned_load
2064 = m_COREI7 | m_GENERIC;
2066 static const unsigned int x86_avx256_split_unaligned_store
2067 = m_COREI7 | m_BDVER | m_GENERIC;
2069 /* In case the average insn count for a single function invocation is
2070 lower than this constant, emit fast (but longer) prologue and
2071 epilogue code. */
2072 #define FAST_PROLOGUE_INSN_COUNT 20
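/* Illustrative sketch (not part of the build): the constant above is used as
   a threshold of roughly the following shape when choosing between the fast
   and the compact prologue/epilogue sequences; the variable names here are
   hypothetical.  */
#if 0
  if (estimated_insns_per_invocation < FAST_PROLOGUE_INSN_COUNT)
    use_fast_prologue_epilogue = true;
#endif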
2074 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2075 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2076 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2077 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2079 /* Array of the smallest class containing reg number REGNO, indexed by
2080 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2082 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2084 /* ax, dx, cx, bx */
2085 AREG, DREG, CREG, BREG,
2086 /* si, di, bp, sp */
2087 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2088 /* FP registers */
2089 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2090 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2091 /* arg pointer */
2092 NON_Q_REGS,
2093 /* flags, fpsr, fpcr, frame */
2094 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2095 /* SSE registers */
2096 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2097 SSE_REGS, SSE_REGS,
2098 /* MMX registers */
2099 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2100 MMX_REGS, MMX_REGS,
2101 /* REX registers */
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2104 /* SSE REX registers */
2105 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2106 SSE_REGS, SSE_REGS,
2109 /* The "default" register map used in 32bit mode. */
2111 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2113 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2114 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2115 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2116 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2117 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2119 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2122 /* The "default" register map used in 64bit mode. */
2124 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2126 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2127 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2128 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2129 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2130 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2131 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2132 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2135 /* Define the register numbers to be used in Dwarf debugging information.
2136 The SVR4 reference port C compiler uses the following register numbers
2137 in its Dwarf output code:
2138 0 for %eax (gcc regno = 0)
2139 1 for %ecx (gcc regno = 2)
2140 2 for %edx (gcc regno = 1)
2141 3 for %ebx (gcc regno = 3)
2142 4 for %esp (gcc regno = 7)
2143 5 for %ebp (gcc regno = 6)
2144 6 for %esi (gcc regno = 4)
2145 7 for %edi (gcc regno = 5)
2146 The following three DWARF register numbers are never generated by
2147 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2148 believes these numbers have these meanings.
2149 8 for %eip (no gcc equivalent)
2150 9 for %eflags (gcc regno = 17)
2151 10 for %trapno (no gcc equivalent)
2152 It is not at all clear how we should number the FP stack registers
2153 for the x86 architecture. If the version of SDB on x86/svr4 were
2154 a bit less brain dead with respect to floating-point then we would
2155 have a precedent to follow with respect to DWARF register numbers
2156 for x86 FP registers, but the SDB on x86/svr4 is so completely
2157 broken with respect to FP registers that it is hardly worth thinking
2158 of it as something to strive for compatibility with.
2159 The version of x86/svr4 SDB I have at the moment does (partially)
2160 seem to believe that DWARF register number 11 is associated with
2161 the x86 register %st(0), but that's about all. Higher DWARF
2162 register numbers don't seem to be associated with anything in
2163 particular, and even for DWARF regno 11, SDB only seems to under-
2164 stand that it should say that a variable lives in %st(0) (when
2165 asked via an `=' command) if we said it was in DWARF regno 11,
2166 but SDB still prints garbage when asked for the value of the
2167 variable in question (via a `/' command).
2168 (Also note that the labels SDB prints for various FP stack regs
2169 when doing an `x' command are all wrong.)
2170 Note that these problems generally don't affect the native SVR4
2171 C compiler because it doesn't allow the use of -O with -g and
2172 because when it is *not* optimizing, it allocates a memory
2173 location for each floating-point variable, and the memory
2174 location is what gets described in the DWARF AT_location
2175 attribute for the variable in question.
2176 Regardless of the severe mental illness of the x86/svr4 SDB, we
2177 do something sensible here and we use the following DWARF
2178 register numbers. Note that these are all stack-top-relative
2179 numbers.
2180 11 for %st(0) (gcc regno = 8)
2181 12 for %st(1) (gcc regno = 9)
2182 13 for %st(2) (gcc regno = 10)
2183 14 for %st(3) (gcc regno = 11)
2184 15 for %st(4) (gcc regno = 12)
2185 16 for %st(5) (gcc regno = 13)
2186 17 for %st(6) (gcc regno = 14)
2187 18 for %st(7) (gcc regno = 15)
2189 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2191 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2192 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2193 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2194 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2195 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2197 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
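/* Illustrative sketch (not part of the build): the maps above are normally
   consumed through a target-header macro that selects the 32-bit or 64-bit
   table; the macro name below is an assumption, not the real one.  */
#if 0
#define EXAMPLE_DBX_REGISTER_NUMBER(n) \
  (TARGET_64BIT ? dbx64_register_map[n] : dbx_register_map[n])
#endif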
2200 /* Define parameter passing and return registers. */
2202 static int const x86_64_int_parameter_registers[6] =
2204 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2207 static int const x86_64_ms_abi_int_parameter_registers[4] =
2209 CX_REG, DX_REG, R8_REG, R9_REG
2212 static int const x86_64_int_return_registers[4] =
2214 AX_REG, DX_REG, DI_REG, SI_REG
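/* Illustrative sketch (not part of the build): the parameter-register arrays
   above are indexed by argument slot number, and which array applies depends
   on the calling ABI.  The helper name is hypothetical.  */
#if 0
static int
example_int_parm_reg (int slot, bool ms_abi)
{
  return ms_abi ? x86_64_ms_abi_int_parameter_registers[slot]
		: x86_64_int_parameter_registers[slot];
}
#endif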
2217 /* Define the structure for the machine field in struct function. */
2219 struct GTY(()) stack_local_entry {
2220 unsigned short mode;
2221 unsigned short n;
2222 rtx rtl;
2223 struct stack_local_entry *next;
2226 /* Structure describing stack frame layout.
2227 Stack grows downward:
2229 [arguments]
2230 <- ARG_POINTER
2231 saved pc
2233 saved static chain if ix86_static_chain_on_stack
2235 saved frame pointer if frame_pointer_needed
2236 <- HARD_FRAME_POINTER
2237 [saved regs]
2238 <- regs_save_offset
2239 [padding0]
2241 [saved SSE regs]
2242 <- sse_regs_save_offset
2243 [padding1] |
2244 | <- FRAME_POINTER
2245 [va_arg registers] |
2247 [frame] |
2249 [padding2] | = to_allocate
2250 <- STACK_POINTER
2252 struct ix86_frame
2254 int nsseregs;
2255 int nregs;
2256 int va_arg_size;
2257 int red_zone_size;
2258 int outgoing_arguments_size;
2260 /* The offsets relative to ARG_POINTER. */
2261 HOST_WIDE_INT frame_pointer_offset;
2262 HOST_WIDE_INT hard_frame_pointer_offset;
2263 HOST_WIDE_INT stack_pointer_offset;
2264 HOST_WIDE_INT hfp_save_offset;
2265 HOST_WIDE_INT reg_save_offset;
2266 HOST_WIDE_INT sse_reg_save_offset;
2268 /* When save_regs_using_mov is set, emit prologue using
2269 move instead of push instructions. */
2270 bool save_regs_using_mov;
2273 /* Which cpu we are scheduling for. */
2274 enum attr_cpu ix86_schedule;
2276 /* Which cpu we are optimizing for. */
2277 enum processor_type ix86_tune;
2279 /* Which instruction set architecture to use. */
2280 enum processor_type ix86_arch;
2282 /* True if processor has SSE prefetch instruction. */
2283 unsigned char x86_prefetch_sse;
2285 /* -mstackrealign option */
2286 static const char ix86_force_align_arg_pointer_string[]
2287 = "force_align_arg_pointer";
2289 static rtx (*ix86_gen_leave) (void);
2290 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2292 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2293 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2294 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2296 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2297 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2300 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2302 /* Preferred alignment for stack boundary in bits. */
2303 unsigned int ix86_preferred_stack_boundary;
2305 /* Alignment for incoming stack boundary in bits, as specified on the
2306 command line. */
2307 static unsigned int ix86_user_incoming_stack_boundary;
2309 /* Default alignment for incoming stack boundary in bits. */
2310 static unsigned int ix86_default_incoming_stack_boundary;
2312 /* Alignment for incoming stack boundary in bits. */
2313 unsigned int ix86_incoming_stack_boundary;
2315 /* Calling-ABI-specific va_list type nodes. */
2316 static GTY(()) tree sysv_va_list_type_node;
2317 static GTY(()) tree ms_va_list_type_node;
2319 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2320 char internal_label_prefix[16];
2321 int internal_label_prefix_len;
2323 /* Fence to use after loop using movnt. */
2324 tree x86_mfence;
2326 /* Register class used for passing a given 64bit part of the argument.
2327 These represent classes as documented by the psABI, with the exception
2328 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2329 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2331 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2332 whenever possible (the upper half then contains only padding). */
2333 enum x86_64_reg_class
2335 X86_64_NO_CLASS,
2336 X86_64_INTEGER_CLASS,
2337 X86_64_INTEGERSI_CLASS,
2338 X86_64_SSE_CLASS,
2339 X86_64_SSESF_CLASS,
2340 X86_64_SSEDF_CLASS,
2341 X86_64_SSEUP_CLASS,
2342 X86_64_X87_CLASS,
2343 X86_64_X87UP_CLASS,
2344 X86_64_COMPLEX_X87_CLASS,
2345 X86_64_MEMORY_CLASS
2348 #define MAX_CLASSES 4
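/* Illustrative sketch (not part of the build, hypothetical values): how the
   two eightbytes of  struct { double d; long l; }  would typically be
   classified when the struct is passed by value on x86-64.  */
#if 0
  enum x86_64_reg_class example_classes[MAX_CLASSES];
  example_classes[0] = X86_64_SSEDF_CLASS;	/* first eightbyte -> SSE reg */
  example_classes[1] = X86_64_INTEGER_CLASS;	/* second eightbyte -> GPR */
#endif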
2350 /* Table of constants used by fldpi, fldln2, etc. */
2351 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2352 static bool ext_80387_constants_init = 0;
2355 static struct machine_function * ix86_init_machine_status (void);
2356 static rtx ix86_function_value (const_tree, const_tree, bool);
2357 static bool ix86_function_value_regno_p (const unsigned int);
2358 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2359 const_tree);
2360 static rtx ix86_static_chain (const_tree, bool);
2361 static int ix86_function_regparm (const_tree, const_tree);
2362 static void ix86_compute_frame_layout (struct ix86_frame *);
2363 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2364 rtx, rtx, int);
2365 static void ix86_add_new_builtins (HOST_WIDE_INT);
2366 static tree ix86_canonical_va_list_type (tree);
2367 static void predict_jump (int);
2368 static unsigned int split_stack_prologue_scratch_regno (void);
2369 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2371 enum ix86_function_specific_strings
2373 IX86_FUNCTION_SPECIFIC_ARCH,
2374 IX86_FUNCTION_SPECIFIC_TUNE,
2375 IX86_FUNCTION_SPECIFIC_MAX
2378 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2379 const char *, enum fpmath_unit, bool);
2380 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2381 static void ix86_function_specific_save (struct cl_target_option *);
2382 static void ix86_function_specific_restore (struct cl_target_option *);
2383 static void ix86_function_specific_print (FILE *, int,
2384 struct cl_target_option *);
2385 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2386 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2387 struct gcc_options *);
2388 static bool ix86_can_inline_p (tree, tree);
2389 static void ix86_set_current_function (tree);
2390 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2392 static enum calling_abi ix86_function_abi (const_tree);
2395 #ifndef SUBTARGET32_DEFAULT_CPU
2396 #define SUBTARGET32_DEFAULT_CPU "i386"
2397 #endif
2399 /* Whether -mtune= or -march= were specified */
2400 static int ix86_tune_defaulted;
2401 static int ix86_arch_specified;
2403 /* Vectorization library interface and handlers. */
2404 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2407 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2409 /* Processor target table, indexed by processor number */
2410 struct ptt
2412 const struct processor_costs *cost; /* Processor costs */
2413 const int align_loop; /* Default alignments. */
2414 const int align_loop_max_skip;
2415 const int align_jump;
2416 const int align_jump_max_skip;
2417 const int align_func;
2420 static const struct ptt processor_target_table[PROCESSOR_max] =
2422 {&i386_cost, 4, 3, 4, 3, 4},
2423 {&i486_cost, 16, 15, 16, 15, 16},
2424 {&pentium_cost, 16, 7, 16, 7, 16},
2425 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2426 {&geode_cost, 0, 0, 0, 0, 0},
2427 {&k6_cost, 32, 7, 32, 7, 32},
2428 {&athlon_cost, 16, 7, 16, 7, 16},
2429 {&pentium4_cost, 0, 0, 0, 0, 0},
2430 {&k8_cost, 16, 7, 16, 7, 16},
2431 {&nocona_cost, 0, 0, 0, 0, 0},
2432 /* Core 2 */
2433 {&core_cost, 16, 10, 16, 10, 16},
2434 /* Core i7 */
2435 {&core_cost, 16, 10, 16, 10, 16},
2436 /* Core avx2 */
2437 {&core_cost, 16, 10, 16, 10, 16},
2438 {&generic32_cost, 16, 7, 16, 7, 16},
2439 {&generic64_cost, 16, 10, 16, 10, 16},
2440 {&amdfam10_cost, 32, 24, 32, 7, 32},
2441 {&bdver1_cost, 32, 24, 32, 7, 32},
2442 {&bdver2_cost, 32, 24, 32, 7, 32},
2443 {&bdver3_cost, 32, 24, 32, 7, 32},
2444 {&btver1_cost, 32, 24, 32, 7, 32},
2445 {&btver2_cost, 32, 24, 32, 7, 32},
2446 {&atom_cost, 16, 15, 16, 7, 16}
2449 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2451 "generic",
2452 "i386",
2453 "i486",
2454 "pentium",
2455 "pentium-mmx",
2456 "pentiumpro",
2457 "pentium2",
2458 "pentium3",
2459 "pentium4",
2460 "pentium-m",
2461 "prescott",
2462 "nocona",
2463 "core2",
2464 "corei7",
2465 "core-avx2",
2466 "atom",
2467 "geode",
2468 "k6",
2469 "k6-2",
2470 "k6-3",
2471 "athlon",
2472 "athlon-4",
2473 "k8",
2474 "amdfam10",
2475 "bdver1",
2476 "bdver2",
2477 "bdver3",
2478 "btver1",
2479 "btver2"
2482 static bool
2483 gate_insert_vzeroupper (void)
2485 return TARGET_VZEROUPPER;
2488 static unsigned int
2489 rest_of_handle_insert_vzeroupper (void)
2491 int i;
2493 /* vzeroupper instructions are inserted immediately after reload to
2494 account for possible spills from 256bit registers. The pass
2495 reuses mode switching infrastructure by re-running mode insertion
2496 pass, so disable entities that have already been processed. */
2497 for (i = 0; i < MAX_386_ENTITIES; i++)
2498 ix86_optimize_mode_switching[i] = 0;
2500 ix86_optimize_mode_switching[AVX_U128] = 1;
2502 /* Call optimize_mode_switching. */
2503 pass_mode_switching.pass.execute ();
2504 return 0;
2507 struct rtl_opt_pass pass_insert_vzeroupper =
2510 RTL_PASS,
2511 "vzeroupper", /* name */
2512 OPTGROUP_NONE, /* optinfo_flags */
2513 gate_insert_vzeroupper, /* gate */
2514 rest_of_handle_insert_vzeroupper, /* execute */
2515 NULL, /* sub */
2516 NULL, /* next */
2517 0, /* static_pass_number */
2518 TV_NONE, /* tv_id */
2519 0, /* properties_required */
2520 0, /* properties_provided */
2521 0, /* properties_destroyed */
2522 0, /* todo_flags_start */
2523 TODO_df_finish | TODO_verify_rtl_sharing |
2524 0, /* todo_flags_finish */
2528 /* Return true if a red-zone is in use. */
2530 static inline bool
2531 ix86_using_red_zone (void)
2533 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2536 /* Return a string that documents the current -m options. The caller is
2537 responsible for freeing the string. */
2539 static char *
2540 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2541 const char *tune, enum fpmath_unit fpmath,
2542 bool add_nl_p)
2544 struct ix86_target_opts
2546 const char *option; /* option string */
2547 HOST_WIDE_INT mask; /* isa mask options */
2550 /* This table is ordered so that options like -msse4.2 that imply
2551 preceding options are matched first. */
2552 static struct ix86_target_opts isa_opts[] =
2554 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2555 { "-mfma", OPTION_MASK_ISA_FMA },
2556 { "-mxop", OPTION_MASK_ISA_XOP },
2557 { "-mlwp", OPTION_MASK_ISA_LWP },
2558 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2559 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2560 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2561 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2562 { "-msse3", OPTION_MASK_ISA_SSE3 },
2563 { "-msse2", OPTION_MASK_ISA_SSE2 },
2564 { "-msse", OPTION_MASK_ISA_SSE },
2565 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2566 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2567 { "-mmmx", OPTION_MASK_ISA_MMX },
2568 { "-mabm", OPTION_MASK_ISA_ABM },
2569 { "-mbmi", OPTION_MASK_ISA_BMI },
2570 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2571 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2572 { "-mhle", OPTION_MASK_ISA_HLE },
2573 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2574 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2575 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2576 { "-madx", OPTION_MASK_ISA_ADX },
2577 { "-mtbm", OPTION_MASK_ISA_TBM },
2578 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2579 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2580 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2581 { "-maes", OPTION_MASK_ISA_AES },
2582 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2583 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2584 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2585 { "-mf16c", OPTION_MASK_ISA_F16C },
2586 { "-mrtm", OPTION_MASK_ISA_RTM },
2587 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2588 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2591 /* Flag options. */
2592 static struct ix86_target_opts flag_opts[] =
2594 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2595 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2596 { "-m80387", MASK_80387 },
2597 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2598 { "-malign-double", MASK_ALIGN_DOUBLE },
2599 { "-mcld", MASK_CLD },
2600 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2601 { "-mieee-fp", MASK_IEEE_FP },
2602 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2603 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2604 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2605 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2606 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2607 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2608 { "-mno-red-zone", MASK_NO_RED_ZONE },
2609 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2610 { "-mrecip", MASK_RECIP },
2611 { "-mrtd", MASK_RTD },
2612 { "-msseregparm", MASK_SSEREGPARM },
2613 { "-mstack-arg-probe", MASK_STACK_PROBE },
2614 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2615 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2616 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2617 { "-mvzeroupper", MASK_VZEROUPPER },
2618 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2619 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2620 { "-mprefer-avx128", MASK_PREFER_AVX128},
2623 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2625 char isa_other[40];
2626 char target_other[40];
2627 unsigned num = 0;
2628 unsigned i, j;
2629 char *ret;
2630 char *ptr;
2631 size_t len;
2632 size_t line_len;
2633 size_t sep_len;
2634 const char *abi;
2636 memset (opts, '\0', sizeof (opts));
2638 /* Add -march= option. */
2639 if (arch)
2641 opts[num][0] = "-march=";
2642 opts[num++][1] = arch;
2645 /* Add -mtune= option. */
2646 if (tune)
2648 opts[num][0] = "-mtune=";
2649 opts[num++][1] = tune;
2652 /* Add -m32/-m64/-mx32. */
2653 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2655 if ((isa & OPTION_MASK_ABI_64) != 0)
2656 abi = "-m64";
2657 else
2658 abi = "-mx32";
2659 isa &= ~ (OPTION_MASK_ISA_64BIT
2660 | OPTION_MASK_ABI_64
2661 | OPTION_MASK_ABI_X32);
2663 else
2664 abi = "-m32";
2665 opts[num++][0] = abi;
2667 /* Pick out the ISA options that are set. */
2668 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2670 if ((isa & isa_opts[i].mask) != 0)
2672 opts[num++][0] = isa_opts[i].option;
2673 isa &= ~ isa_opts[i].mask;
2677 if (isa && add_nl_p)
2679 opts[num++][0] = isa_other;
2680 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2681 isa);
2684 /* Add flag options. */
2685 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2687 if ((flags & flag_opts[i].mask) != 0)
2689 opts[num++][0] = flag_opts[i].option;
2690 flags &= ~ flag_opts[i].mask;
2694 if (flags && add_nl_p)
2696 opts[num++][0] = target_other;
2697 sprintf (target_other, "(other flags: %#x)", flags);
2700 /* Add -fpmath= option. */
2701 if (fpmath)
2703 opts[num][0] = "-mfpmath=";
2704 switch ((int) fpmath)
2706 case FPMATH_387:
2707 opts[num++][1] = "387";
2708 break;
2710 case FPMATH_SSE:
2711 opts[num++][1] = "sse";
2712 break;
2714 case FPMATH_387 | FPMATH_SSE:
2715 opts[num++][1] = "sse+387";
2716 break;
2718 default:
2719 gcc_unreachable ();
2723 /* Any options? */
2724 if (num == 0)
2725 return NULL;
2727 gcc_assert (num < ARRAY_SIZE (opts));
2729 /* Size the string. */
2730 len = 0;
2731 sep_len = (add_nl_p) ? 3 : 1;
2732 for (i = 0; i < num; i++)
2734 len += sep_len;
2735 for (j = 0; j < 2; j++)
2736 if (opts[i][j])
2737 len += strlen (opts[i][j]);
2740 /* Build the string. */
2741 ret = ptr = (char *) xmalloc (len);
2742 line_len = 0;
2744 for (i = 0; i < num; i++)
2746 size_t len2[2];
2748 for (j = 0; j < 2; j++)
2749 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2751 if (i != 0)
2753 *ptr++ = ' ';
2754 line_len++;
2756 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2758 *ptr++ = '\\';
2759 *ptr++ = '\n';
2760 line_len = 0;
2764 for (j = 0; j < 2; j++)
2765 if (opts[i][j])
2767 memcpy (ptr, opts[i][j], len2[j]);
2768 ptr += len2[j];
2769 line_len += len2[j];
2773 *ptr = '\0';
2774 gcc_assert (ret + len >= ptr);
2776 return ret;
2779 /* Return true if profiling code should be emitted before the
2780 prologue, and false otherwise.
2781 Note: For x86 with "hotfix" it is sorried (a sorry () diagnostic is issued). */
2782 static bool
2783 ix86_profile_before_prologue (void)
2785 return flag_fentry != 0;
2788 /* Function that is callable from the debugger to print the current
2789 options. */
2790 void
2791 ix86_debug_options (void)
2793 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2794 ix86_arch_string, ix86_tune_string,
2795 ix86_fpmath, true);
2797 if (opts)
2799 fprintf (stderr, "%s\n\n", opts);
2800 free (opts);
2802 else
2803 fputs ("<no options>\n\n", stderr);
2805 return;
2808 /* Override various settings based on options. If MAIN_ARGS_P, the
2809 options are from the command line, otherwise they are from
2810 attributes. */
2812 static void
2813 ix86_option_override_internal (bool main_args_p)
2815 int i;
2816 unsigned int ix86_arch_mask, ix86_tune_mask;
2817 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2818 const char *prefix;
2819 const char *suffix;
2820 const char *sw;
2822 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2823 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2824 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2825 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2826 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2827 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2828 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2829 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2830 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2831 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2832 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2833 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2834 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2835 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2836 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2837 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2838 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2839 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2840 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2841 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2842 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2843 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2844 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2845 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2846 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2847 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2848 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2849 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2850 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2851 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2852 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2853 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2854 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2855 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2856 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2857 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2858 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2859 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2860 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2861 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2863 /* If this reaches 64, we need to widen the flags field of struct pta below. */
2865 static struct pta
2867 const char *const name; /* processor name or nickname. */
2868 const enum processor_type processor;
2869 const enum attr_cpu schedule;
2870 const unsigned HOST_WIDE_INT flags;
2872 const processor_alias_table[] =
2874 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2875 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2876 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2877 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2878 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2879 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2880 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2881 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2882 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2883 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2884 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2885 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2886 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2887 PTA_MMX | PTA_SSE | PTA_FXSR},
2888 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2889 PTA_MMX | PTA_SSE | PTA_FXSR},
2890 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2891 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2892 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2893 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2894 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2895 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2896 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2897 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2898 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2899 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2900 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2901 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2902 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2903 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2904 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2905 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2906 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2907 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2908 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2909 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2910 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2911 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2912 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2913 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2914 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2915 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2916 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2917 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
2918 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2919 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2920 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2921 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2922 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2923 | PTA_XSAVEOPT},
2924 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2925 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2926 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2927 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2928 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2929 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2930 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2931 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2932 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2933 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2934 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2935 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2936 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2937 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2938 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2939 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2940 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2941 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2942 {"x86-64", PROCESSOR_K8, CPU_K8,
2943 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2944 {"k8", PROCESSOR_K8, CPU_K8,
2945 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2946 | PTA_SSE2 | PTA_NO_SAHF},
2947 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2948 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2949 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2950 {"opteron", PROCESSOR_K8, CPU_K8,
2951 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2952 | PTA_SSE2 | PTA_NO_SAHF},
2953 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2954 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2955 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2956 {"athlon64", PROCESSOR_K8, CPU_K8,
2957 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2958 | PTA_SSE2 | PTA_NO_SAHF},
2959 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2960 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2961 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2962 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2963 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2964 | PTA_SSE2 | PTA_NO_SAHF},
2965 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2966 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2967 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2968 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2969 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2970 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2971 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2974 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2975 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2976 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2977 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2978 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2979 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2980 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2981 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2982 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2983 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2984 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2985 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2986 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2987 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2988 | PTA_XSAVEOPT},
2989 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2990 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2991 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2992 | PTA_FXSR | PTA_XSAVE},
2993 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
2994 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2995 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2996 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2997 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2998 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3000 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3001 PTA_HLE /* flags are only used for -march switch. */ },
3002 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3003 PTA_64BIT
3004 | PTA_HLE /* flags are only used for -march switch. */ },
3007 /* -mrecip options. */
3008 static struct
3010 const char *string; /* option name */
3011 unsigned int mask; /* mask bits to set */
3013 const recip_options[] =
3015 { "all", RECIP_MASK_ALL },
3016 { "none", RECIP_MASK_NONE },
3017 { "div", RECIP_MASK_DIV },
3018 { "sqrt", RECIP_MASK_SQRT },
3019 { "vec-div", RECIP_MASK_VEC_DIV },
3020 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3023 int const pta_size = ARRAY_SIZE (processor_alias_table);
3025 /* Set up prefix/suffix so the error messages refer to either the command
3026 line argument, or the attribute(target). */
3027 if (main_args_p)
3029 prefix = "-m";
3030 suffix = "";
3031 sw = "switch";
3033 else
3035 prefix = "option(\"";
3036 suffix = "\")";
3037 sw = "attribute";
3040 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3041 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3042 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3043 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3044 #ifdef TARGET_BI_ARCH
3045 else
3047 #if TARGET_BI_ARCH == 1
3048 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3049 is on and OPTION_MASK_ABI_X32 is off. We turn off
3050 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3051 -mx32. */
3052 if (TARGET_X32)
3053 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3054 #else
3055 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3056 on and OPTION_MASK_ABI_64 is off. We turn off
3057 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3058 -m64. */
3059 if (TARGET_LP64)
3060 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3061 #endif
3063 #endif
3065 if (TARGET_X32)
3067 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3068 OPTION_MASK_ABI_64 for TARGET_X32. */
3069 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3070 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3072 else if (TARGET_LP64)
3074 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3075 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3076 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3077 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3080 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3081 SUBTARGET_OVERRIDE_OPTIONS;
3082 #endif
3084 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3085 SUBSUBTARGET_OVERRIDE_OPTIONS;
3086 #endif
3088 /* -fPIC is the default for x86_64. */
3089 if (TARGET_MACHO && TARGET_64BIT)
3090 flag_pic = 2;
3092 /* Need to check -mtune=generic first. */
3093 if (ix86_tune_string)
3095 if (!strcmp (ix86_tune_string, "generic")
3096 || !strcmp (ix86_tune_string, "i686")
3097 /* As special support for cross compilers we treat -mtune=native
3098 as -mtune=generic. With native compilers we won't see
3099 -mtune=native, as the driver has already rewritten it. */
3100 || !strcmp (ix86_tune_string, "native"))
3102 if (TARGET_64BIT)
3103 ix86_tune_string = "generic64";
3104 else
3105 ix86_tune_string = "generic32";
3107 /* If this call is for setting the option attribute, allow the
3108 generic32/generic64 that was previously set. */
3109 else if (!main_args_p
3110 && (!strcmp (ix86_tune_string, "generic32")
3111 || !strcmp (ix86_tune_string, "generic64")))
3113 else if (!strncmp (ix86_tune_string, "generic", 7))
3114 error ("bad value (%s) for %stune=%s %s",
3115 ix86_tune_string, prefix, suffix, sw);
3116 else if (!strcmp (ix86_tune_string, "x86-64"))
3117 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3118 "%stune=k8%s or %stune=generic%s instead as appropriate",
3119 prefix, suffix, prefix, suffix, prefix, suffix);
3121 else
3123 if (ix86_arch_string)
3124 ix86_tune_string = ix86_arch_string;
3125 if (!ix86_tune_string)
3127 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3128 ix86_tune_defaulted = 1;
3131 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3132 need to use a sensible tune option. */
3133 if (!strcmp (ix86_tune_string, "generic")
3134 || !strcmp (ix86_tune_string, "x86-64")
3135 || !strcmp (ix86_tune_string, "i686"))
3137 if (TARGET_64BIT)
3138 ix86_tune_string = "generic64";
3139 else
3140 ix86_tune_string = "generic32";
3144 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3146 /* rep; movq isn't available in 32-bit code. */
3147 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3148 ix86_stringop_alg = no_stringop;
3151 if (!ix86_arch_string)
3152 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3153 else
3154 ix86_arch_specified = 1;
3156 if (global_options_set.x_ix86_pmode)
3158 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3159 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3160 error ("address mode %qs not supported in the %s bit mode",
3161 TARGET_64BIT ? "short" : "long",
3162 TARGET_64BIT ? "64" : "32");
3164 else
3165 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3167 if (!global_options_set.x_ix86_abi)
3168 ix86_abi = DEFAULT_ABI;
3170 if (global_options_set.x_ix86_cmodel)
3172 switch (ix86_cmodel)
3174 case CM_SMALL:
3175 case CM_SMALL_PIC:
3176 if (flag_pic)
3177 ix86_cmodel = CM_SMALL_PIC;
3178 if (!TARGET_64BIT)
3179 error ("code model %qs not supported in the %s bit mode",
3180 "small", "32");
3181 break;
3183 case CM_MEDIUM:
3184 case CM_MEDIUM_PIC:
3185 if (flag_pic)
3186 ix86_cmodel = CM_MEDIUM_PIC;
3187 if (!TARGET_64BIT)
3188 error ("code model %qs not supported in the %s bit mode",
3189 "medium", "32");
3190 else if (TARGET_X32)
3191 error ("code model %qs not supported in x32 mode",
3192 "medium");
3193 break;
3195 case CM_LARGE:
3196 case CM_LARGE_PIC:
3197 if (flag_pic)
3198 ix86_cmodel = CM_LARGE_PIC;
3199 if (!TARGET_64BIT)
3200 error ("code model %qs not supported in the %s bit mode",
3201 "large", "32");
3202 else if (TARGET_X32)
3203 error ("code model %qs not supported in x32 mode",
3204 "large");
3205 break;
3207 case CM_32:
3208 if (flag_pic)
3209 error ("code model %s does not support PIC mode", "32");
3210 if (TARGET_64BIT)
3211 error ("code model %qs not supported in the %s bit mode",
3212 "32", "64");
3213 break;
3215 case CM_KERNEL:
3216 if (flag_pic)
3218 error ("code model %s does not support PIC mode", "kernel");
3219 ix86_cmodel = CM_32;
3221 if (!TARGET_64BIT)
3222 error ("code model %qs not supported in the %s bit mode",
3223 "kernel", "32");
3224 break;
3226 default:
3227 gcc_unreachable ();
3230 else
3232 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3233 use of rip-relative addressing. This eliminates fixups that
3234 would otherwise be needed if this object is to be placed in a
3235 DLL, and is essentially just as efficient as direct addressing. */
3236 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3237 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3238 else if (TARGET_64BIT && TARGET_RDOS)
3239 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3240 else if (TARGET_64BIT)
3241 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3242 else
3243 ix86_cmodel = CM_32;
3245 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3247 error ("-masm=intel not supported in this configuration");
3248 ix86_asm_dialect = ASM_ATT;
3250 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3251 sorry ("%i-bit mode not compiled in",
3252 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
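  /* Look up -march= in the processor alias table.  The matching entry
     supplies the scheduling model, the architecture, the default tuning,
     and the baseline ISA flags; ISA bits the user set explicitly are
     left untouched (checked via ix86_isa_flags_explicit below).  */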
3254 for (i = 0; i < pta_size; i++)
3255 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3257 ix86_schedule = processor_alias_table[i].schedule;
3258 ix86_arch = processor_alias_table[i].processor;
3259 /* Default cpu tuning to the architecture. */
3260 ix86_tune = ix86_arch;
3262 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3263 error ("CPU you selected does not support x86-64 "
3264 "instruction set");
3266 if (processor_alias_table[i].flags & PTA_MMX
3267 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3268 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3269 if (processor_alias_table[i].flags & PTA_3DNOW
3270 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3271 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3272 if (processor_alias_table[i].flags & PTA_3DNOW_A
3273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3274 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3275 if (processor_alias_table[i].flags & PTA_SSE
3276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3277 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3278 if (processor_alias_table[i].flags & PTA_SSE2
3279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3280 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3281 if (processor_alias_table[i].flags & PTA_SSE3
3282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3283 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3284 if (processor_alias_table[i].flags & PTA_SSSE3
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3286 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3287 if (processor_alias_table[i].flags & PTA_SSE4_1
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3289 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3290 if (processor_alias_table[i].flags & PTA_SSE4_2
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3292 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3293 if (processor_alias_table[i].flags & PTA_AVX
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3295 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3296 if (processor_alias_table[i].flags & PTA_AVX2
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3298 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3299 if (processor_alias_table[i].flags & PTA_FMA
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3301 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3302 if (processor_alias_table[i].flags & PTA_SSE4A
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3304 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3305 if (processor_alias_table[i].flags & PTA_FMA4
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3307 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3308 if (processor_alias_table[i].flags & PTA_XOP
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3310 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3311 if (processor_alias_table[i].flags & PTA_LWP
3312 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3313 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3314 if (processor_alias_table[i].flags & PTA_ABM
3315 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3316 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3317 if (processor_alias_table[i].flags & PTA_BMI
3318 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3319 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3320 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3321 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3322 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3323 if (processor_alias_table[i].flags & PTA_TBM
3324 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3325 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3326 if (processor_alias_table[i].flags & PTA_BMI2
3327 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3328 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3329 if (processor_alias_table[i].flags & PTA_CX16
3330 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3331 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3332 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3333 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3334 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3335 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3336 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3337 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3338 if (processor_alias_table[i].flags & PTA_MOVBE
3339 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3340 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3341 if (processor_alias_table[i].flags & PTA_AES
3342 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3343 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3344 if (processor_alias_table[i].flags & PTA_PCLMUL
3345 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3346 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3347 if (processor_alias_table[i].flags & PTA_FSGSBASE
3348 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3349 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3350 if (processor_alias_table[i].flags & PTA_RDRND
3351 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3352 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3353 if (processor_alias_table[i].flags & PTA_F16C
3354 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3355 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3356 if (processor_alias_table[i].flags & PTA_RTM
3357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3358 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3359 if (processor_alias_table[i].flags & PTA_HLE
3360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3361 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3362 if (processor_alias_table[i].flags & PTA_PRFCHW
3363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3364 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3365 if (processor_alias_table[i].flags & PTA_RDSEED
3366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3367 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3368 if (processor_alias_table[i].flags & PTA_ADX
3369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3370 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3371 if (processor_alias_table[i].flags & PTA_FXSR
3372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3373 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3374 if (processor_alias_table[i].flags & PTA_XSAVE
3375 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3376 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3377 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3378 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3379 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3380 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3381 x86_prefetch_sse = true;
3383 break;
3386 if (!strcmp (ix86_arch_string, "generic"))
3387 error ("generic CPU can be used only for %stune=%s %s",
3388 prefix, suffix, sw);
3389 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3390 error ("bad value (%s) for %sarch=%s %s",
3391 ix86_arch_string, prefix, suffix, sw);
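  /* Cache the per-architecture feature tests as simple booleans keyed by
     the selected architecture's bit in ix86_arch_mask.  */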
3393 ix86_arch_mask = 1u << ix86_arch;
3394 for (i = 0; i < X86_ARCH_LAST; ++i)
3395 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
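  /* Do the corresponding lookup for -mtune=.  Only the scheduling model
     and the tuning target are taken from the entry here; the ISA flags
     were already settled by the -march= lookup above.  */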
3397 for (i = 0; i < pta_size; i++)
3398 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3400 ix86_schedule = processor_alias_table[i].schedule;
3401 ix86_tune = processor_alias_table[i].processor;
3402 if (TARGET_64BIT)
3404 if (!(processor_alias_table[i].flags & PTA_64BIT))
3406 if (ix86_tune_defaulted)
3408 ix86_tune_string = "x86-64";
3409 for (i = 0; i < pta_size; i++)
3410 if (! strcmp (ix86_tune_string,
3411 processor_alias_table[i].name))
3412 break;
3413 ix86_schedule = processor_alias_table[i].schedule;
3414 ix86_tune = processor_alias_table[i].processor;
3416 else
3417 error ("CPU you selected does not support x86-64 "
3418 "instruction set");
3421 else
3423 /* Adjust tuning when compiling for 32-bit ABI. */
3424 switch (ix86_tune)
3426 case PROCESSOR_GENERIC64:
3427 ix86_tune = PROCESSOR_GENERIC32;
3428 ix86_schedule = CPU_PENTIUMPRO;
3429 break;
3431 default:
3432 break;
3435 /* Intel CPUs have always interpreted SSE prefetch instructions as
3436 NOPs; so, we can enable SSE prefetch instructions even when
3437 -mtune (rather than -march) points us to a processor that has them.
3438 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3439 higher processors. */
3440 if (TARGET_CMOV
3441 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3442 x86_prefetch_sse = true;
3443 break;
3446 if (ix86_tune_specified && i == pta_size)
3447 error ("bad value (%s) for %stune=%s %s",
3448 ix86_tune_string, prefix, suffix, sw);
3450 ix86_tune_mask = 1u << ix86_tune;
3451 for (i = 0; i < X86_TUNE_LAST; ++i)
3452 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3454 #ifndef USE_IX86_FRAME_POINTER
3455 #define USE_IX86_FRAME_POINTER 0
3456 #endif
3458 #ifndef USE_X86_64_FRAME_POINTER
3459 #define USE_X86_64_FRAME_POINTER 0
3460 #endif
3462 /* Set the default values for switches whose default depends on TARGET_64BIT
3463 in case they weren't overwritten by command line options. */
3464 if (TARGET_64BIT)
3466 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3467 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3468 if (flag_asynchronous_unwind_tables == 2)
3469 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3470 if (flag_pcc_struct_return == 2)
3471 flag_pcc_struct_return = 0;
3473 else
3475 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3476 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3477 if (flag_asynchronous_unwind_tables == 2)
3478 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3479 if (flag_pcc_struct_return == 2)
3480 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
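  /* Pick the cost tables: the size-oriented table when optimizing for
     size, otherwise the cost table of the CPU we are tuning for.  */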
3483 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3484 if (optimize_size)
3485 ix86_cost = &ix86_size_cost;
3486 else
3487 ix86_cost = ix86_tune_cost;
3489 /* Arrange to set up i386_stack_locals for all functions. */
3490 init_machine_status = ix86_init_machine_status;
3492 /* Validate -mregparm= value. */
3493 if (global_options_set.x_ix86_regparm)
3495 if (TARGET_64BIT)
3496 warning (0, "-mregparm is ignored in 64-bit mode");
3497 if (ix86_regparm > REGPARM_MAX)
3499 error ("-mregparm=%d is not between 0 and %d",
3500 ix86_regparm, REGPARM_MAX);
3501 ix86_regparm = 0;
3504 if (TARGET_64BIT)
3505 ix86_regparm = REGPARM_MAX;
3507 /* Default align_* from the processor table. */
3508 if (align_loops == 0)
3510 align_loops = processor_target_table[ix86_tune].align_loop;
3511 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3513 if (align_jumps == 0)
3515 align_jumps = processor_target_table[ix86_tune].align_jump;
3516 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3518 if (align_functions == 0)
3520 align_functions = processor_target_table[ix86_tune].align_func;
3523 /* Provide default for -mbranch-cost= value. */
3524 if (!global_options_set.x_ix86_branch_cost)
3525 ix86_branch_cost = ix86_cost->branch_cost;
3527 if (TARGET_64BIT)
3529 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3531 /* Enable by default the SSE and MMX builtins. Do allow the user to
3532 explicitly disable any of these. In particular, disabling SSE and
3533 MMX for kernel code is extremely useful. */
3534 if (!ix86_arch_specified)
3535 ix86_isa_flags
3536 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3537 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3539 if (TARGET_RTD)
3540 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3542 else
3544 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3546 if (!ix86_arch_specified)
3547 ix86_isa_flags
3548 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3550 /* The i386 ABI does not specify a red zone. It still makes sense to use
3551 one when the programmer takes care to keep the stack from being clobbered. */
3552 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3553 target_flags |= MASK_NO_RED_ZONE;
3556 /* Keep nonleaf frame pointers. */
3557 if (flag_omit_frame_pointer)
3558 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3559 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3560 flag_omit_frame_pointer = 1;
3562 /* If we're doing fast math, we don't care about comparison order
3563 wrt NaNs. This lets us use a shorter comparison sequence. */
3564 if (flag_finite_math_only)
3565 target_flags &= ~MASK_IEEE_FP;
3567 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3568 since the insns won't need emulation. */
3569 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3570 target_flags &= ~MASK_NO_FANCY_MATH_387;
3572 /* Likewise, if the target doesn't have a 387, or we've specified
3573 software floating point, don't use 387 inline intrinsics. */
3574 if (!TARGET_80387)
3575 target_flags |= MASK_NO_FANCY_MATH_387;
3577 /* Turn on MMX builtins for -msse. */
3578 if (TARGET_SSE)
3579 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3581 /* Enable SSE prefetch. */
3582 if (TARGET_SSE || TARGET_PRFCHW)
3583 x86_prefetch_sse = true;
3585 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3586 if (TARGET_SSE4_2 || TARGET_ABM)
3587 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3589 /* Turn on lzcnt instruction for -mabm. */
3590 if (TARGET_ABM)
3591 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3593 /* Validate -mpreferred-stack-boundary= value or default it to
3594 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3595 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3596 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3598 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3599 int max = (TARGET_SEH ? 4 : 12);
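      /* The argument is log2 of the boundary in bytes: the boundary in bits
         is computed below as (1 << arg) * BITS_PER_UNIT, so e.g. an argument
         of 4 selects a 128-bit (16-byte) preferred stack boundary.  */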
3601 if (ix86_preferred_stack_boundary_arg < min
3602 || ix86_preferred_stack_boundary_arg > max)
3604 if (min == max)
3605 error ("-mpreferred-stack-boundary is not supported "
3606 "for this target");
3607 else
3608 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3609 ix86_preferred_stack_boundary_arg, min, max);
3611 else
3612 ix86_preferred_stack_boundary
3613 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3616 /* Set the default value for -mstackrealign. */
3617 if (ix86_force_align_arg_pointer == -1)
3618 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3620 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3622 /* Validate -mincoming-stack-boundary= value or default it to
3623 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3624 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3625 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3627 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3628 || ix86_incoming_stack_boundary_arg > 12)
3629 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3630 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3631 else
3633 ix86_user_incoming_stack_boundary
3634 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3635 ix86_incoming_stack_boundary
3636 = ix86_user_incoming_stack_boundary;
3640 /* Accept -msseregparm only if at least SSE support is enabled. */
3641 if (TARGET_SSEREGPARM
3642 && ! TARGET_SSE)
3643 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3645 if (global_options_set.x_ix86_fpmath)
3647 if (ix86_fpmath & FPMATH_SSE)
3649 if (!TARGET_SSE)
3651 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3652 ix86_fpmath = FPMATH_387;
3654 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3656 warning (0, "387 instruction set disabled, using SSE arithmetics");
3657 ix86_fpmath = FPMATH_SSE;
3661 else
3662 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3664 /* If the i387 is disabled, then do not return values in it. */
3665 if (!TARGET_80387)
3666 target_flags &= ~MASK_FLOAT_RETURNS;
3668 /* Use an external vectorized library when vectorizing calls to intrinsics. */
3669 if (global_options_set.x_ix86_veclibabi_type)
3670 switch (ix86_veclibabi_type)
3672 case ix86_veclibabi_type_svml:
3673 ix86_veclib_handler = ix86_veclibabi_svml;
3674 break;
3676 case ix86_veclibabi_type_acml:
3677 ix86_veclib_handler = ix86_veclibabi_acml;
3678 break;
3680 default:
3681 gcc_unreachable ();
3684 if ((!USE_IX86_FRAME_POINTER
3685 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3686 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3687 && !optimize_size)
3688 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 /* ??? Unwind info is not correct around the CFG unless either a frame
3691 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3692 unwind info generation to be aware of the CFG and propagating states
3693 around edges. */
3694 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3695 || flag_exceptions || flag_non_call_exceptions)
3696 && flag_omit_frame_pointer
3697 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3699 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3700 warning (0, "unwind tables currently require either a frame pointer "
3701 "or %saccumulate-outgoing-args%s for correctness",
3702 prefix, suffix);
3703 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3706 /* If stack probes are required, the space used for large function
3707 arguments on the stack must also be probed, so enable
3708 -maccumulate-outgoing-args so this happens in the prologue. */
3709 if (TARGET_STACK_PROBE
3710 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3712 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3713 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3714 "for correctness", prefix, suffix);
3715 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3718 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3720 char *p;
3721 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3722 p = strchr (internal_label_prefix, 'X');
3723 internal_label_prefix_len = p - internal_label_prefix;
3724 *p = '\0';
3727 /* When a scheduling description is not available, disable the scheduler
3728 pass so it won't slow down compilation and make x87 code slower. */
3729 if (!TARGET_SCHEDULE)
3730 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
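  /* Seed the prefetch and cache-geometry params from the tuning cost
     table; maybe_set_param_value leaves any value the user set explicitly
     untouched.  */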
3732 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3733 ix86_tune_cost->simultaneous_prefetches,
3734 global_options.x_param_values,
3735 global_options_set.x_param_values);
3736 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3737 ix86_tune_cost->prefetch_block,
3738 global_options.x_param_values,
3739 global_options_set.x_param_values);
3740 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3741 ix86_tune_cost->l1_cache_size,
3742 global_options.x_param_values,
3743 global_options_set.x_param_values);
3744 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3745 ix86_tune_cost->l2_cache_size,
3746 global_options.x_param_values,
3747 global_options_set.x_param_values);
3749 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3750 if (flag_prefetch_loop_arrays < 0
3751 && HAVE_prefetch
3752 && (optimize >= 3 || flag_profile_use)
3753 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3754 flag_prefetch_loop_arrays = 1;
3756 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3757 can be optimized to ap = __builtin_next_arg (0). */
3758 if (!TARGET_64BIT && !flag_split_stack)
3759 targetm.expand_builtin_va_start = NULL;
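  /* Select Pmode- and word-size-specific RTL generator functions up front
     so later code can emit leave/monitor/TLS/stack-probe patterns without
     re-checking TARGET_64BIT or Pmode at each use.  */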
3761 if (TARGET_64BIT)
3763 ix86_gen_leave = gen_leave_rex64;
3764 if (Pmode == DImode)
3766 ix86_gen_monitor = gen_sse3_monitor64_di;
3767 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3768 ix86_gen_tls_local_dynamic_base_64
3769 = gen_tls_local_dynamic_base_64_di;
3771 else
3773 ix86_gen_monitor = gen_sse3_monitor64_si;
3774 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3775 ix86_gen_tls_local_dynamic_base_64
3776 = gen_tls_local_dynamic_base_64_si;
3779 else
3781 ix86_gen_leave = gen_leave;
3782 ix86_gen_monitor = gen_sse3_monitor;
3785 if (Pmode == DImode)
3787 ix86_gen_add3 = gen_adddi3;
3788 ix86_gen_sub3 = gen_subdi3;
3789 ix86_gen_sub3_carry = gen_subdi3_carry;
3790 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3791 ix86_gen_andsp = gen_anddi3;
3792 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3793 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3794 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3796 else
3798 ix86_gen_add3 = gen_addsi3;
3799 ix86_gen_sub3 = gen_subsi3;
3800 ix86_gen_sub3_carry = gen_subsi3_carry;
3801 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3802 ix86_gen_andsp = gen_andsi3;
3803 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3804 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3805 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3808 #ifdef USE_IX86_CLD
3809 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3810 if (!TARGET_64BIT)
3811 target_flags |= MASK_CLD & ~target_flags_explicit;
3812 #endif
3814 if (!TARGET_64BIT && flag_pic)
3816 if (flag_fentry > 0)
3817 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3818 "with -fpic");
3819 flag_fentry = 0;
3821 else if (TARGET_SEH)
3823 if (flag_fentry == 0)
3824 sorry ("-mno-fentry isn%'t compatible with SEH");
3825 flag_fentry = 1;
3827 else if (flag_fentry < 0)
3829 #if defined(PROFILE_BEFORE_PROLOGUE)
3830 flag_fentry = 1;
3831 #else
3832 flag_fentry = 0;
3833 #endif
3836 if (TARGET_AVX)
3838 /* When not optimizing for size, enable the vzeroupper optimization for
3839 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3840 AVX unaligned loads/stores. */
3841 if (!optimize_size)
3843 if (flag_expensive_optimizations
3844 && !(target_flags_explicit & MASK_VZEROUPPER))
3845 target_flags |= MASK_VZEROUPPER;
3846 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3847 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3848 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3849 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3850 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3851 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3852 /* Enable 128-bit AVX instruction generation
3853 for the auto-vectorizer. */
3854 if (TARGET_AVX128_OPTIMAL
3855 && !(target_flags_explicit & MASK_PREFER_AVX128))
3856 target_flags |= MASK_PREFER_AVX128;
3859 else
3861 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3862 target_flags &= ~MASK_VZEROUPPER;
3865 if (ix86_recip_name)
3867 char *p = ASTRDUP (ix86_recip_name);
3868 char *q;
3869 unsigned int mask, i;
3870 bool invert;
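      /* -mrecip= takes a comma-separated list of keywords from
         recip_options[]; a keyword may be prefixed with '!' to clear the
         corresponding bits instead of setting them.  */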
3872 while ((q = strtok (p, ",")) != NULL)
3874 p = NULL;
3875 if (*q == '!')
3877 invert = true;
3878 q++;
3880 else
3881 invert = false;
3883 if (!strcmp (q, "default"))
3884 mask = RECIP_MASK_ALL;
3885 else
3887 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3888 if (!strcmp (q, recip_options[i].string))
3890 mask = recip_options[i].mask;
3891 break;
3894 if (i == ARRAY_SIZE (recip_options))
3896 error ("unknown option for -mrecip=%s", q);
3897 invert = false;
3898 mask = RECIP_MASK_NONE;
3902 recip_mask_explicit |= mask;
3903 if (invert)
3904 recip_mask &= ~mask;
3905 else
3906 recip_mask |= mask;
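  /* Plain -mrecip / -mno-recip then sets or clears every approximation
     that was not configured explicitly via -mrecip=.  */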
3910 if (TARGET_RECIP)
3911 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3912 else if (target_flags_explicit & MASK_RECIP)
3913 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3915 /* Default long double to 64-bit for Bionic. */
3916 if (TARGET_HAS_BIONIC
3917 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3918 target_flags |= MASK_LONG_DOUBLE_64;
3920 /* Save the initial options in case the user uses function-specific
3921 options. */
3922 if (main_args_p)
3923 target_option_default_node = target_option_current_node
3924 = build_target_option_node ();
3927 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3929 static void
3930 ix86_option_override (void)
3932 static struct register_pass_info insert_vzeroupper_info
3933 = { &pass_insert_vzeroupper.pass, "reload",
3934 1, PASS_POS_INSERT_AFTER
3937 ix86_option_override_internal (true);
3940 /* This needs to be done at start up. It's convenient to do it here. */
3941 register_pass (&insert_vzeroupper_info);
3944 /* Update register usage after having seen the compiler flags. */
3946 static void
3947 ix86_conditional_register_usage (void)
3949 int i, c_mask;
3950 unsigned int j;
3952 /* The PIC register, if it exists, is fixed. */
3953 j = PIC_OFFSET_TABLE_REGNUM;
3954 if (j != INVALID_REGNUM)
3955 fixed_regs[j] = call_used_regs[j] = 1;
3957 /* For 32-bit targets, squash the REX registers. */
3958 if (! TARGET_64BIT)
3960 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3961 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3962 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3963 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3966 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3967 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3968 : TARGET_64BIT ? (1 << 2)
3969 : (1 << 1));
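  /* c_mask selects which conditional bit in the CALL_USED_REGISTERS
     initializer applies: bit 3 for the 64-bit MS ABI, bit 2 for other
     64-bit targets, bit 1 for 32-bit targets.  */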
3971 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3973 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 /* Set/reset conditionally defined registers from
3976 CALL_USED_REGISTERS initializer. */
3977 if (call_used_regs[i] > 1)
3978 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3980 /* Compute the CLOBBERED_REGS register set as the call-used
3981 registers of the GENERAL_REGS register set. */
3982 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3983 && call_used_regs[i])
3984 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3987 /* If MMX is disabled, squash the registers. */
3988 if (! TARGET_MMX)
3989 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3990 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3993 /* If SSE is disabled, squash the registers. */
3994 if (! TARGET_SSE)
3995 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3996 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3997 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3999 /* If the FPU is disabled, squash the registers. */
4000 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4001 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4002 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4003 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4007 /* Save the current options */
4009 static void
4010 ix86_function_specific_save (struct cl_target_option *ptr)
4012 ptr->arch = ix86_arch;
4013 ptr->schedule = ix86_schedule;
4014 ptr->tune = ix86_tune;
4015 ptr->branch_cost = ix86_branch_cost;
4016 ptr->tune_defaulted = ix86_tune_defaulted;
4017 ptr->arch_specified = ix86_arch_specified;
4018 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4019 ptr->ix86_target_flags_explicit = target_flags_explicit;
4020 ptr->x_recip_mask_explicit = recip_mask_explicit;
4022 /* The fields are char but the variables are not; make sure the
4023 values fit in the fields. */
4024 gcc_assert (ptr->arch == ix86_arch);
4025 gcc_assert (ptr->schedule == ix86_schedule);
4026 gcc_assert (ptr->tune == ix86_tune);
4027 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4030 /* Restore the current options */
4032 static void
4033 ix86_function_specific_restore (struct cl_target_option *ptr)
4035 enum processor_type old_tune = ix86_tune;
4036 enum processor_type old_arch = ix86_arch;
4037 unsigned int ix86_arch_mask, ix86_tune_mask;
4038 int i;
4040 ix86_arch = (enum processor_type) ptr->arch;
4041 ix86_schedule = (enum attr_cpu) ptr->schedule;
4042 ix86_tune = (enum processor_type) ptr->tune;
4043 ix86_branch_cost = ptr->branch_cost;
4044 ix86_tune_defaulted = ptr->tune_defaulted;
4045 ix86_arch_specified = ptr->arch_specified;
4046 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4047 target_flags_explicit = ptr->ix86_target_flags_explicit;
4048 recip_mask_explicit = ptr->x_recip_mask_explicit;
4050 /* Recreate the arch feature tests if the arch changed */
4051 if (old_arch != ix86_arch)
4053 ix86_arch_mask = 1u << ix86_arch;
4054 for (i = 0; i < X86_ARCH_LAST; ++i)
4055 ix86_arch_features[i]
4056 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4059 /* Recreate the tune optimization tests */
4060 if (old_tune != ix86_tune)
4062 ix86_tune_mask = 1u << ix86_tune;
4063 for (i = 0; i < X86_TUNE_LAST; ++i)
4064 ix86_tune_features[i]
4065 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4069 /* Print the current options */
4071 static void
4072 ix86_function_specific_print (FILE *file, int indent,
4073 struct cl_target_option *ptr)
4075 char *target_string
4076 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4077 NULL, NULL, ptr->x_ix86_fpmath, false);
4079 fprintf (file, "%*sarch = %d (%s)\n",
4080 indent, "",
4081 ptr->arch,
4082 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4083 ? cpu_names[ptr->arch]
4084 : "<unknown>"));
4086 fprintf (file, "%*stune = %d (%s)\n",
4087 indent, "",
4088 ptr->tune,
4089 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4090 ? cpu_names[ptr->tune]
4091 : "<unknown>"));
4093 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4095 if (target_string)
4097 fprintf (file, "%*s%s\n", indent, "", target_string);
4098 free (target_string);
4103 /* Inner function to process the attribute((target(...))), take an argument and
4104 set the current options from the argument. If we have a list, recursively go
4105 over the list. */
4107 static bool
4108 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4109 struct gcc_options *enum_opts_set)
4111 char *next_optstr;
4112 bool ret = true;
4114 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4115 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4116 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4117 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4118 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4120 enum ix86_opt_type
4122 ix86_opt_unknown,
4123 ix86_opt_yes,
4124 ix86_opt_no,
4125 ix86_opt_str,
4126 ix86_opt_enum,
4127 ix86_opt_isa
4130 static const struct
4132 const char *string;
4133 size_t len;
4134 enum ix86_opt_type type;
4135 int opt;
4136 int mask;
4137 } attrs[] = {
4138 /* isa options */
4139 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4140 IX86_ATTR_ISA ("abm", OPT_mabm),
4141 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4142 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4143 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4144 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4145 IX86_ATTR_ISA ("aes", OPT_maes),
4146 IX86_ATTR_ISA ("avx", OPT_mavx),
4147 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4148 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4149 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4150 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4151 IX86_ATTR_ISA ("sse", OPT_msse),
4152 IX86_ATTR_ISA ("sse2", OPT_msse2),
4153 IX86_ATTR_ISA ("sse3", OPT_msse3),
4154 IX86_ATTR_ISA ("sse4", OPT_msse4),
4155 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4156 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4157 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4158 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4159 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4160 IX86_ATTR_ISA ("fma", OPT_mfma),
4161 IX86_ATTR_ISA ("xop", OPT_mxop),
4162 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4163 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4164 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4165 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4166 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4167 IX86_ATTR_ISA ("hle", OPT_mhle),
4168 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4169 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4170 IX86_ATTR_ISA ("adx", OPT_madx),
4171 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4172 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4173 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4175 /* enum options */
4176 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4178 /* string options */
4179 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4180 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4182 /* flag options */
4183 IX86_ATTR_YES ("cld",
4184 OPT_mcld,
4185 MASK_CLD),
4187 IX86_ATTR_NO ("fancy-math-387",
4188 OPT_mfancy_math_387,
4189 MASK_NO_FANCY_MATH_387),
4191 IX86_ATTR_YES ("ieee-fp",
4192 OPT_mieee_fp,
4193 MASK_IEEE_FP),
4195 IX86_ATTR_YES ("inline-all-stringops",
4196 OPT_minline_all_stringops,
4197 MASK_INLINE_ALL_STRINGOPS),
4199 IX86_ATTR_YES ("inline-stringops-dynamically",
4200 OPT_minline_stringops_dynamically,
4201 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4203 IX86_ATTR_NO ("align-stringops",
4204 OPT_mno_align_stringops,
4205 MASK_NO_ALIGN_STRINGOPS),
4207 IX86_ATTR_YES ("recip",
4208 OPT_mrecip,
4209 MASK_RECIP),
4213 /* If this is a list, recurse to get the options. */
4214 if (TREE_CODE (args) == TREE_LIST)
4216 bool ret = true;
4218 for (; args; args = TREE_CHAIN (args))
4219 if (TREE_VALUE (args)
4220 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4221 p_strings, enum_opts_set))
4222 ret = false;
4224 return ret;
4227 else if (TREE_CODE (args) != STRING_CST)
4229 error ("attribute %<target%> argument not a string");
4230 return false;
4233 /* Handle multiple arguments separated by commas. */
4234 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
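  /* Walk the comma-separated attribute string; each item (optionally
     prefixed with "no-") is matched against the attrs[] table above and
     applied as an ISA option, a target flag, a string option or an enum
     option.  */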
4236 while (next_optstr && *next_optstr != '\0')
4238 char *p = next_optstr;
4239 char *orig_p = p;
4240 char *comma = strchr (next_optstr, ',');
4241 const char *opt_string;
4242 size_t len, opt_len;
4243 int opt;
4244 bool opt_set_p;
4245 char ch;
4246 unsigned i;
4247 enum ix86_opt_type type = ix86_opt_unknown;
4248 int mask = 0;
4250 if (comma)
4252 *comma = '\0';
4253 len = comma - next_optstr;
4254 next_optstr = comma + 1;
4256 else
4258 len = strlen (p);
4259 next_optstr = NULL;
4262 /* Recognize no-xxx. */
4263 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4265 opt_set_p = false;
4266 p += 3;
4267 len -= 3;
4269 else
4270 opt_set_p = true;
4272 /* Find the option. */
4273 ch = *p;
4274 opt = N_OPTS;
4275 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4277 type = attrs[i].type;
4278 opt_len = attrs[i].len;
4279 if (ch == attrs[i].string[0]
4280 && ((type != ix86_opt_str && type != ix86_opt_enum)
4281 ? len == opt_len
4282 : len > opt_len)
4283 && memcmp (p, attrs[i].string, opt_len) == 0)
4285 opt = attrs[i].opt;
4286 mask = attrs[i].mask;
4287 opt_string = attrs[i].string;
4288 break;
4292 /* Process the option. */
4293 if (opt == N_OPTS)
4295 error ("attribute(target(\"%s\")) is unknown", orig_p);
4296 ret = false;
4299 else if (type == ix86_opt_isa)
4301 struct cl_decoded_option decoded;
4303 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4304 ix86_handle_option (&global_options, &global_options_set,
4305 &decoded, input_location);
4308 else if (type == ix86_opt_yes || type == ix86_opt_no)
4310 if (type == ix86_opt_no)
4311 opt_set_p = !opt_set_p;
4313 if (opt_set_p)
4314 target_flags |= mask;
4315 else
4316 target_flags &= ~mask;
4319 else if (type == ix86_opt_str)
4321 if (p_strings[opt])
4323 error ("option(\"%s\") was already specified", opt_string);
4324 ret = false;
4326 else
4327 p_strings[opt] = xstrdup (p + opt_len);
4330 else if (type == ix86_opt_enum)
4332 bool arg_ok;
4333 int value;
4335 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4336 if (arg_ok)
4337 set_option (&global_options, enum_opts_set, opt, value,
4338 p + opt_len, DK_UNSPECIFIED, input_location,
4339 global_dc);
4340 else
4342 error ("attribute(target(\"%s\")) is unknown", orig_p);
4343 ret = false;
4347 else
4348 gcc_unreachable ();
4351 return ret;
4354 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4356 tree
4357 ix86_valid_target_attribute_tree (tree args)
4359 const char *orig_arch_string = ix86_arch_string;
4360 const char *orig_tune_string = ix86_tune_string;
4361 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4362 int orig_tune_defaulted = ix86_tune_defaulted;
4363 int orig_arch_specified = ix86_arch_specified;
4364 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4365 tree t = NULL_TREE;
4366 int i;
4367 struct cl_target_option *def
4368 = TREE_TARGET_OPTION (target_option_default_node);
4369 struct gcc_options enum_opts_set;
4371 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4373 /* Process each of the options on the chain. */
4374 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4375 &enum_opts_set))
4376 return error_mark_node;
4378 /* If the changed options are different from the default, rerun
4379 ix86_option_override_internal, and then save the options away.
4380 The string options are attribute options, and will be undone
4381 when we copy the save structure. */
4382 if (ix86_isa_flags != def->x_ix86_isa_flags
4383 || target_flags != def->x_target_flags
4384 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4385 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4386 || enum_opts_set.x_ix86_fpmath)
4388 /* If we are using the default tune= or arch=, undo the string assigned,
4389 and use the default. */
4390 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4391 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4392 else if (!orig_arch_specified)
4393 ix86_arch_string = NULL;
4395 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4396 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4397 else if (orig_tune_defaulted)
4398 ix86_tune_string = NULL;
4400 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4401 if (enum_opts_set.x_ix86_fpmath)
4402 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4403 else if (!TARGET_64BIT && TARGET_SSE)
4405 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4406 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4409 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4410 ix86_option_override_internal (false);
4412 /* Add any builtin functions with the new isa if any. */
4413 ix86_add_new_builtins (ix86_isa_flags);
4415 /* Save the current options unless we are validating options for
4416 #pragma. */
4417 t = build_target_option_node ();
4419 ix86_arch_string = orig_arch_string;
4420 ix86_tune_string = orig_tune_string;
4421 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4423 /* Free up memory allocated to hold the strings */
4424 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4425 free (option_strings[i]);
4428 return t;
4431 /* Hook to validate attribute((target("string"))). */
4433 static bool
4434 ix86_valid_target_attribute_p (tree fndecl,
4435 tree ARG_UNUSED (name),
4436 tree args,
4437 int ARG_UNUSED (flags))
4439 struct cl_target_option cur_target;
4440 bool ret = true;
4442 /* attribute((target("default"))) does nothing, beyond
4443 affecting multi-versioning. */
4444 if (TREE_VALUE (args)
4445 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4446 && TREE_CHAIN (args) == NULL_TREE
4447 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4448 return true;
4450 tree old_optimize = build_optimization_node ();
4451 tree new_target, new_optimize;
4452 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4454 /* If the function changed the optimization levels as well as setting target
4455 options, start with the optimizations specified. */
4456 if (func_optimize && func_optimize != old_optimize)
4457 cl_optimization_restore (&global_options,
4458 TREE_OPTIMIZATION (func_optimize));
4460 /* The target attributes may also change some optimization flags, so update
4461 the optimization options if necessary. */
4462 cl_target_option_save (&cur_target, &global_options);
4463 new_target = ix86_valid_target_attribute_tree (args);
4464 new_optimize = build_optimization_node ();
4466 if (new_target == error_mark_node)
4467 ret = false;
4469 else if (fndecl && new_target)
4471 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4473 if (old_optimize != new_optimize)
4474 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4477 cl_target_option_restore (&global_options, &cur_target);
4479 if (old_optimize != new_optimize)
4480 cl_optimization_restore (&global_options,
4481 TREE_OPTIMIZATION (old_optimize));
4483 return ret;
4487 /* Hook to determine if one function can safely inline another. */
4489 static bool
4490 ix86_can_inline_p (tree caller, tree callee)
4492 bool ret = false;
4493 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4494 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4496 /* If callee has no option attributes, then it is ok to inline. */
4497 if (!callee_tree)
4498 ret = true;
4500 /* If caller has no option attributes, but callee does then it is not ok to
4501 inline. */
4502 else if (!caller_tree)
4503 ret = false;
4505 else
4507 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4508 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4510 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4511 function can inline an SSE2 function but an SSE2 function can't inline
4512 an SSE4 function. */
4513 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4514 != callee_opts->x_ix86_isa_flags)
4515 ret = false;
4517 /* See if we have the same non-isa options. */
4518 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4519 ret = false;
4521 /* See if arch, tune, etc. are the same. */
4522 else if (caller_opts->arch != callee_opts->arch)
4523 ret = false;
4525 else if (caller_opts->tune != callee_opts->tune)
4526 ret = false;
4528 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4529 ret = false;
4531 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4532 ret = false;
4534 else
4535 ret = true;
4538 return ret;
4542 /* Remember the last target of ix86_set_current_function. */
4543 static GTY(()) tree ix86_previous_fndecl;
4545 /* Establish appropriate back-end context for processing the function
4546 FNDECL. The argument might be NULL to indicate processing at top
4547 level, outside of any function scope. */
4548 static void
4549 ix86_set_current_function (tree fndecl)
4551 /* Only change the context if the function changes. This hook is called
4552 several times in the course of compiling a function, and we don't want to
4553 slow things down too much or call target_reinit when it isn't safe. */
4554 if (fndecl && fndecl != ix86_previous_fndecl)
4556 tree old_tree = (ix86_previous_fndecl
4557 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4558 : NULL_TREE);
4560 tree new_tree = (fndecl
4561 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4562 : NULL_TREE);
4564 ix86_previous_fndecl = fndecl;
4565 if (old_tree == new_tree)
4568 else if (new_tree)
4570 cl_target_option_restore (&global_options,
4571 TREE_TARGET_OPTION (new_tree));
4572 target_reinit ();
4575 else if (old_tree)
4577 struct cl_target_option *def
4578 = TREE_TARGET_OPTION (target_option_current_node);
4580 cl_target_option_restore (&global_options, def);
4581 target_reinit ();
4587 /* Return true if this goes in large data/bss. */
4589 static bool
4590 ix86_in_large_data_p (tree exp)
4592 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4593 return false;
4595 /* Functions are never large data. */
4596 if (TREE_CODE (exp) == FUNCTION_DECL)
4597 return false;
4599 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4601 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4602 if (strcmp (section, ".ldata") == 0
4603 || strcmp (section, ".lbss") == 0)
4604 return true;
4605 return false;
4607 else
4609 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4611 /* If this is an incomplete type with size 0, then we can't put it
4612 in data because it might be too big when completed. */
4613 if (!size || size > ix86_section_threshold)
4614 return true;
4617 return false;
4620 /* Switch to the appropriate section for output of DECL.
4621 DECL is either a `VAR_DECL' node or a constant of some sort.
4622 RELOC indicates whether forming the initial value of DECL requires
4623 link-time relocations. */
4625 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4626 ATTRIBUTE_UNUSED;
4628 static section *
4629 x86_64_elf_select_section (tree decl, int reloc,
4630 unsigned HOST_WIDE_INT align)
4632 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4633 && ix86_in_large_data_p (decl))
4635 const char *sname = NULL;
4636 unsigned int flags = SECTION_WRITE;
4637 switch (categorize_decl_for_section (decl, reloc))
4639 case SECCAT_DATA:
4640 sname = ".ldata";
4641 break;
4642 case SECCAT_DATA_REL:
4643 sname = ".ldata.rel";
4644 break;
4645 case SECCAT_DATA_REL_LOCAL:
4646 sname = ".ldata.rel.local";
4647 break;
4648 case SECCAT_DATA_REL_RO:
4649 sname = ".ldata.rel.ro";
4650 break;
4651 case SECCAT_DATA_REL_RO_LOCAL:
4652 sname = ".ldata.rel.ro.local";
4653 break;
4654 case SECCAT_BSS:
4655 sname = ".lbss";
4656 flags |= SECTION_BSS;
4657 break;
4658 case SECCAT_RODATA:
4659 case SECCAT_RODATA_MERGE_STR:
4660 case SECCAT_RODATA_MERGE_STR_INIT:
4661 case SECCAT_RODATA_MERGE_CONST:
4662 sname = ".lrodata";
4663 flags = 0;
4664 break;
4665 case SECCAT_SRODATA:
4666 case SECCAT_SDATA:
4667 case SECCAT_SBSS:
4668 gcc_unreachable ();
4669 case SECCAT_TEXT:
4670 case SECCAT_TDATA:
4671 case SECCAT_TBSS:
4672 /* We don't split these for the medium model. Place them into
4673 default sections and hope for the best. */
4674 break;
4676 if (sname)
4678 /* We might get called with string constants, but get_named_section
4679 doesn't like them as they are not DECLs. Also, we need to set
4680 flags in that case. */
4681 if (!DECL_P (decl))
4682 return get_section (sname, flags, NULL);
4683 return get_named_section (decl, sname, reloc);
4686 return default_elf_select_section (decl, reloc, align);
4689 /* Build up a unique section name, expressed as a
4690 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4691 RELOC indicates whether the initial value of EXP requires
4692 link-time relocations. */
4694 static void ATTRIBUTE_UNUSED
4695 x86_64_elf_unique_section (tree decl, int reloc)
4697 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4698 && ix86_in_large_data_p (decl))
4700 const char *prefix = NULL;
4701 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4702 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4704 switch (categorize_decl_for_section (decl, reloc))
4706 case SECCAT_DATA:
4707 case SECCAT_DATA_REL:
4708 case SECCAT_DATA_REL_LOCAL:
4709 case SECCAT_DATA_REL_RO:
4710 case SECCAT_DATA_REL_RO_LOCAL:
4711 prefix = one_only ? ".ld" : ".ldata";
4712 break;
4713 case SECCAT_BSS:
4714 prefix = one_only ? ".lb" : ".lbss";
4715 break;
4716 case SECCAT_RODATA:
4717 case SECCAT_RODATA_MERGE_STR:
4718 case SECCAT_RODATA_MERGE_STR_INIT:
4719 case SECCAT_RODATA_MERGE_CONST:
4720 prefix = one_only ? ".lr" : ".lrodata";
4721 break;
4722 case SECCAT_SRODATA:
4723 case SECCAT_SDATA:
4724 case SECCAT_SBSS:
4725 gcc_unreachable ();
4726 case SECCAT_TEXT:
4727 case SECCAT_TDATA:
4728 case SECCAT_TBSS:
4729 /* We don't split these for the medium model. Place them into
4730 default sections and hope for the best. */
4731 break;
4733 if (prefix)
4735 const char *name, *linkonce;
4736 char *string;
4738 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4739 name = targetm.strip_name_encoding (name);
4741 /* If we're using one_only, then there needs to be a .gnu.linkonce
4742 prefix to the section name. */
4743 linkonce = one_only ? ".gnu.linkonce" : "";
4745 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4747 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4748 return;
4751 default_unique_section (decl, reloc);
4754 #ifdef COMMON_ASM_OP
4755 /* This says how to output assembler code to declare an
4756 uninitialized external linkage data object.
4758 For medium model x86-64 we need to use the .largecomm pseudo-op for
4759 large objects. */
4760 void
4761 x86_elf_aligned_common (FILE *file,
4762 const char *name, unsigned HOST_WIDE_INT size,
4763 int align)
4765 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4766 && size > (unsigned int)ix86_section_threshold)
4767 fputs (".largecomm\t", file);
4768 else
4769 fputs (COMMON_ASM_OP, file);
4770 assemble_name (file, name);
4771 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4772 size, align / BITS_PER_UNIT);
4774 #endif
4776 /* Utility function for targets to use in implementing
4777 ASM_OUTPUT_ALIGNED_BSS. */
4779 void
4780 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4781 const char *name, unsigned HOST_WIDE_INT size,
4782 int align)
4784 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4785 && size > (unsigned int)ix86_section_threshold)
4786 switch_to_section (get_named_section (decl, ".lbss", 0));
4787 else
4788 switch_to_section (bss_section);
4789 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4790 #ifdef ASM_DECLARE_OBJECT_NAME
4791 last_assemble_variable_decl = decl;
4792 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4793 #else
4794 /* Standard thing is just output label for the object. */
4795 ASM_OUTPUT_LABEL (file, name);
4796 #endif /* ASM_DECLARE_OBJECT_NAME */
4797 ASM_OUTPUT_SKIP (file, size ? size : 1);
4800 /* Decide whether we must probe the stack before any space allocation
4801 on this target. It's essentially TARGET_STACK_PROBE except when
4802 -fstack-check causes the stack to be already probed differently. */
4804 bool
4805 ix86_target_stack_probe (void)
4807 /* Do not probe the stack twice if static stack checking is enabled. */
4808 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4809 return false;
4811 return TARGET_STACK_PROBE;
4814 /* Decide whether we can make a sibling call to a function. DECL is the
4815 declaration of the function being targeted by the call and EXP is the
4816 CALL_EXPR representing the call. */
4818 static bool
4819 ix86_function_ok_for_sibcall (tree decl, tree exp)
4821 tree type, decl_or_type;
4822 rtx a, b;
4824 /* If we are generating position-independent code, we cannot sibcall
4825 optimize any indirect call, or a direct call to a global function,
4826 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4827 if (!TARGET_MACHO
4828 && !TARGET_64BIT
4829 && flag_pic
4830 && (!decl || !targetm.binds_local_p (decl)))
4831 return false;
4833 /* If we need to align the outgoing stack, then sibcalling would
4834 unalign the stack, which may break the called function. */
4835 if (ix86_minimum_incoming_stack_boundary (true)
4836 < PREFERRED_STACK_BOUNDARY)
4837 return false;
4839 if (decl)
4841 decl_or_type = decl;
4842 type = TREE_TYPE (decl);
4844 else
4846 /* We're looking at the CALL_EXPR, we need the type of the function. */
4847 type = CALL_EXPR_FN (exp); /* pointer expression */
4848 type = TREE_TYPE (type); /* pointer type */
4849 type = TREE_TYPE (type); /* function type */
4850 decl_or_type = type;
4853 /* Check that the return value locations are the same. For example,
4854 if we are returning floats on the 80387 register stack, we cannot
4855 make a sibcall from a function that doesn't return a float to a
4856 function that does or, conversely, from a function that does return
4857 a float to a function that doesn't; the necessary stack adjustment
4858 would not be executed. This is also the place we notice
4859 differences in the return value ABI. Note that it is ok for one
4860 of the functions to have void return type as long as the return
4861 value of the other is passed in a register. */
4862 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4863 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4864 cfun->decl, false);
4865 if (STACK_REG_P (a) || STACK_REG_P (b))
4867 if (!rtx_equal_p (a, b))
4868 return false;
4870 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4872 else if (!rtx_equal_p (a, b))
4873 return false;
4875 if (TARGET_64BIT)
4877 /* The SYSV ABI has more call-clobbered registers;
4878 disallow sibcalls from MS to SYSV. */
4879 if (cfun->machine->call_abi == MS_ABI
4880 && ix86_function_type_abi (type) == SYSV_ABI)
4881 return false;
4883 else
4885 /* If this call is indirect, we'll need to be able to use a
4886 call-clobbered register for the address of the target function.
4887 Make sure that all such registers are not used for passing
4888 parameters. Note that DLLIMPORT functions are indirect. */
4889 if (!decl
4890 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4892 if (ix86_function_regparm (type, NULL) >= 3)
4894 /* ??? Need to count the actual number of registers to be used,
4895 not the possible number of registers. Fix later. */
4896 return false;
4901 /* Otherwise okay. That also includes certain types of indirect calls. */
4902 return true;
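/* Illustrative sketch (not part of GCC itself; HELPER and WRAPPER are
   made-up functions): in 32-bit PIC code,

     extern int helper (int);
     int wrapper (int x) { return helper (x); }

   the tail call to HELPER is rejected by the flag_pic test above,
   because calling through the PLT needs %ebx live, while a
   locally-bound static helper would remain eligible for sibcall
   optimization, subject to the remaining checks.  */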
4905 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4906 and "sseregparm" calling convention attributes;
4907 arguments as in struct attribute_spec.handler. */
4909 static tree
4910 ix86_handle_cconv_attribute (tree *node, tree name,
4911 tree args,
4912 int flags ATTRIBUTE_UNUSED,
4913 bool *no_add_attrs)
4915 if (TREE_CODE (*node) != FUNCTION_TYPE
4916 && TREE_CODE (*node) != METHOD_TYPE
4917 && TREE_CODE (*node) != FIELD_DECL
4918 && TREE_CODE (*node) != TYPE_DECL)
4920 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4921 name);
4922 *no_add_attrs = true;
4923 return NULL_TREE;
4926 /* Can combine regparm with all attributes but fastcall and thiscall. */
4927 if (is_attribute_p ("regparm", name))
4929 tree cst;
4931 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4933 error ("fastcall and regparm attributes are not compatible");
4936 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4938 error ("regparm and thiscall attributes are not compatible");
4941 cst = TREE_VALUE (args);
4942 if (TREE_CODE (cst) != INTEGER_CST)
4944 warning (OPT_Wattributes,
4945 "%qE attribute requires an integer constant argument",
4946 name);
4947 *no_add_attrs = true;
4949 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4951 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4952 name, REGPARM_MAX);
4953 *no_add_attrs = true;
4956 return NULL_TREE;
4959 if (TARGET_64BIT)
4961 /* Do not warn when emulating the MS ABI. */
4962 if ((TREE_CODE (*node) != FUNCTION_TYPE
4963 && TREE_CODE (*node) != METHOD_TYPE)
4964 || ix86_function_type_abi (*node) != MS_ABI)
4965 warning (OPT_Wattributes, "%qE attribute ignored",
4966 name);
4967 *no_add_attrs = true;
4968 return NULL_TREE;
4971 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4972 if (is_attribute_p ("fastcall", name))
4974 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4976 error ("fastcall and cdecl attributes are not compatible");
4978 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4980 error ("fastcall and stdcall attributes are not compatible");
4982 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4984 error ("fastcall and regparm attributes are not compatible");
4986 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4988 error ("fastcall and thiscall attributes are not compatible");
4992 /* Can combine stdcall with fastcall (redundant), regparm and
4993 sseregparm. */
4994 else if (is_attribute_p ("stdcall", name))
4996 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4998 error ("stdcall and cdecl attributes are not compatible");
5000 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5002 error ("stdcall and fastcall attributes are not compatible");
5004 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5006 error ("stdcall and thiscall attributes are not compatible");
5010 /* Can combine cdecl with regparm and sseregparm. */
5011 else if (is_attribute_p ("cdecl", name))
5013 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5015 error ("stdcall and cdecl attributes are not compatible");
5017 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 error ("fastcall and cdecl attributes are not compatible");
5021 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5023 error ("cdecl and thiscall attributes are not compatible");
5026 else if (is_attribute_p ("thiscall", name))
5028 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5029 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5030 name);
5031 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5033 error ("stdcall and thiscall attributes are not compatible");
5035 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5037 error ("fastcall and thiscall attributes are not compatible");
5039 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5041 error ("cdecl and thiscall attributes are not compatible");
5045 /* Can combine sseregparm with all attributes. */
5047 return NULL_TREE;
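/* Illustrative sketch (not part of GCC itself; F and G are made-up
   declarations): the handler above accepts

     int __attribute__ ((stdcall, regparm (2))) f (int, int);

   since stdcall combines with regparm, but diagnoses

     int __attribute__ ((fastcall, regparm (2))) g (int, int);

   with "fastcall and regparm attributes are not compatible".  */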
5050 /* The transactional memory builtins are implicitly regparm or fastcall
5051 depending on the ABI. Override the generic do-nothing attribute that
5052 these builtins were declared with, and replace it with one of the two
5053 attributes that we expect elsewhere. */
5055 static tree
5056 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5057 tree args ATTRIBUTE_UNUSED,
5058 int flags ATTRIBUTE_UNUSED,
5059 bool *no_add_attrs)
5061 tree alt;
5063 /* In no case do we want to add the placeholder attribute. */
5064 *no_add_attrs = true;
5066 /* The 64-bit ABI is unchanged for transactional memory. */
5067 if (TARGET_64BIT)
5068 return NULL_TREE;
5070 /* ??? Is there a better way to validate 32-bit Windows? We have
5071 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5072 if (CHECK_STACK_LIMIT > 0)
5073 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5074 else
5076 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5077 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5079 decl_attributes (node, alt, flags);
5081 return NULL_TREE;
5084 /* This function determines from TYPE the calling-convention. */
5086 unsigned int
5087 ix86_get_callcvt (const_tree type)
5089 unsigned int ret = 0;
5090 bool is_stdarg;
5091 tree attrs;
5093 if (TARGET_64BIT)
5094 return IX86_CALLCVT_CDECL;
5096 attrs = TYPE_ATTRIBUTES (type);
5097 if (attrs != NULL_TREE)
5099 if (lookup_attribute ("cdecl", attrs))
5100 ret |= IX86_CALLCVT_CDECL;
5101 else if (lookup_attribute ("stdcall", attrs))
5102 ret |= IX86_CALLCVT_STDCALL;
5103 else if (lookup_attribute ("fastcall", attrs))
5104 ret |= IX86_CALLCVT_FASTCALL;
5105 else if (lookup_attribute ("thiscall", attrs))
5106 ret |= IX86_CALLCVT_THISCALL;
5108 /* Regparm isn't allowed for thiscall and fastcall. */
5109 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5111 if (lookup_attribute ("regparm", attrs))
5112 ret |= IX86_CALLCVT_REGPARM;
5113 if (lookup_attribute ("sseregparm", attrs))
5114 ret |= IX86_CALLCVT_SSEREGPARM;
5117 if (IX86_BASE_CALLCVT(ret) != 0)
5118 return ret;
5121 is_stdarg = stdarg_p (type);
5122 if (TARGET_RTD && !is_stdarg)
5123 return IX86_CALLCVT_STDCALL | ret;
5125 if (ret != 0
5126 || is_stdarg
5127 || TREE_CODE (type) != METHOD_TYPE
5128 || ix86_function_type_abi (type) != MS_ABI)
5129 return IX86_CALLCVT_CDECL | ret;
5131 return IX86_CALLCVT_THISCALL;
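/* Illustrative sketch (not part of GCC itself): in 32-bit code a
   declaration such as

     void __attribute__ ((fastcall)) f (int);

   yields IX86_CALLCVT_FASTCALL, an unadorned prototyped non-variadic
   function compiled with -mrtd yields IX86_CALLCVT_STDCALL, and any
   stdarg function falls back to IX86_CALLCVT_CDECL; regparm and
   sseregparm only add modifier bits on top of the base convention.  */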
5134 /* Return 0 if the attributes for two types are incompatible, 1 if they
5135 are compatible, and 2 if they are nearly compatible (which causes a
5136 warning to be generated). */
5138 static int
5139 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5141 unsigned int ccvt1, ccvt2;
5143 if (TREE_CODE (type1) != FUNCTION_TYPE
5144 && TREE_CODE (type1) != METHOD_TYPE)
5145 return 1;
5147 ccvt1 = ix86_get_callcvt (type1);
5148 ccvt2 = ix86_get_callcvt (type2);
5149 if (ccvt1 != ccvt2)
5150 return 0;
5151 if (ix86_function_regparm (type1, NULL)
5152 != ix86_function_regparm (type2, NULL))
5153 return 0;
5155 return 1;
5158 /* Return the regparm value for a function with the indicated TYPE and DECL.
5159 DECL may be NULL when calling function indirectly
5160 or considering a libcall. */
5162 static int
5163 ix86_function_regparm (const_tree type, const_tree decl)
5165 tree attr;
5166 int regparm;
5167 unsigned int ccvt;
5169 if (TARGET_64BIT)
5170 return (ix86_function_type_abi (type) == SYSV_ABI
5171 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5172 ccvt = ix86_get_callcvt (type);
5173 regparm = ix86_regparm;
5175 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5177 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5178 if (attr)
5180 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5181 return regparm;
5184 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5185 return 2;
5186 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5187 return 1;
5189 /* Use register calling convention for local functions when possible. */
5190 if (decl
5191 && TREE_CODE (decl) == FUNCTION_DECL
5192 && optimize
5193 && !(profile_flag && !flag_fentry))
5195 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5196 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5197 if (i && i->local && i->can_change_signature)
5199 int local_regparm, globals = 0, regno;
5201 /* Make sure no regparm register is taken by a
5202 fixed register variable. */
5203 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5204 if (fixed_regs[local_regparm])
5205 break;
5207 /* We don't want to use regparm(3) for nested functions as
5208 these use a static chain pointer in the third argument. */
5209 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5210 local_regparm = 2;
5212 /* In 32-bit mode save a register for the split stack. */
5213 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5214 local_regparm = 2;
5216 /* Each fixed register usage increases register pressure,
5217 so fewer registers should be used for argument passing.
5218 This functionality can be overridden by an explicit
5219 regparm value. */
5220 for (regno = AX_REG; regno <= DI_REG; regno++)
5221 if (fixed_regs[regno])
5222 globals++;
5224 local_regparm
5225 = globals < local_regparm ? local_regparm - globals : 0;
5227 if (local_regparm > regparm)
5228 regparm = local_regparm;
5232 return regparm;
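/* Illustrative sketch (not part of GCC itself; F is a made-up
   declaration): with

     int __attribute__ ((regparm (3))) f (int a, int b, int c);

   the attribute forces a regparm value of 3 (A, B and C arrive in
   %eax, %edx and %ecx), while a 32-bit static function that binds
   locally may be promoted to a register convention by the
   local-function path above when optimizing.  */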
5235 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5236 DFmode (2) arguments in SSE registers for a function with the
5237 indicated TYPE and DECL. DECL may be NULL when calling function
5238 indirectly or considering a libcall. Otherwise return 0. */
5240 static int
5241 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5243 gcc_assert (!TARGET_64BIT);
5245 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5246 by the sseregparm attribute. */
5247 if (TARGET_SSEREGPARM
5248 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5250 if (!TARGET_SSE)
5252 if (warn)
5254 if (decl)
5255 error ("calling %qD with attribute sseregparm without "
5256 "SSE/SSE2 enabled", decl);
5257 else
5258 error ("calling %qT with attribute sseregparm without "
5259 "SSE/SSE2 enabled", type);
5261 return 0;
5264 return 2;
5267 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5268 (and DFmode for SSE2) arguments in SSE registers. */
5269 if (decl && TARGET_SSE_MATH && optimize
5270 && !(profile_flag && !flag_fentry))
5272 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5273 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5274 if (i && i->local && i->can_change_signature)
5275 return TARGET_SSE2 ? 2 : 1;
5278 return 0;
5281 /* Return true if EAX is live at the start of the function. Used by
5282 ix86_expand_prologue to determine if we need special help before
5283 calling allocate_stack_worker. */
5285 static bool
5286 ix86_eax_live_at_start_p (void)
5288 /* Cheat. Don't bother working forward from ix86_function_regparm
5289 to the function type to whether an actual argument is located in
5290 eax. Instead just look at cfg info, which is still close enough
5291 to correct at this point. This gives false positives for broken
5292 functions that might use uninitialized data that happens to be
5293 allocated in eax, but who cares? */
5294 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5297 static bool
5298 ix86_keep_aggregate_return_pointer (tree fntype)
5300 tree attr;
5302 if (!TARGET_64BIT)
5304 attr = lookup_attribute ("callee_pop_aggregate_return",
5305 TYPE_ATTRIBUTES (fntype));
5306 if (attr)
5307 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5309 /* For the 32-bit MS ABI the default is to keep the aggregate
5310 return pointer. */
5311 if (ix86_function_type_abi (fntype) == MS_ABI)
5312 return true;
5314 return KEEP_AGGREGATE_RETURN_POINTER != 0;
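/* Illustrative sketch (not part of GCC itself; F is a made-up
   declaration): for 32-bit code,

     struct big __attribute__ ((callee_pop_aggregate_return (0))) f (void);

   makes this predicate return true, so the callee leaves the hidden
   aggregate-return pointer for the caller to pop; with an argument of
   1 it returns false and ix86_return_pops_args credits the callee with
   popping one pointer-sized word.  */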
5317 /* Value is the number of bytes of arguments automatically
5318 popped when returning from a subroutine call.
5319 FUNDECL is the declaration node of the function (as a tree),
5320 FUNTYPE is the data type of the function (as a tree),
5321 or for a library call it is an identifier node for the subroutine name.
5322 SIZE is the number of bytes of arguments passed on the stack.
5324 On the 80386, the RTD insn may be used to pop them if the number
5325 of args is fixed, but if the number is variable then the caller
5326 must pop them all. RTD can't be used for library calls now
5327 because the library is compiled with the Unix compiler.
5328 Use of RTD is a selectable option, since it is incompatible with
5329 standard Unix calling sequences. If the option is not selected,
5330 the caller must always pop the args.
5332 The attribute stdcall is equivalent to RTD on a per module basis. */
5334 static int
5335 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5337 unsigned int ccvt;
5339 /* None of the 64-bit ABIs pop arguments. */
5340 if (TARGET_64BIT)
5341 return 0;
5343 ccvt = ix86_get_callcvt (funtype);
5345 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5346 | IX86_CALLCVT_THISCALL)) != 0
5347 && ! stdarg_p (funtype))
5348 return size;
5350 /* Lose any fake structure return argument if it is passed on the stack. */
5351 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5352 && !ix86_keep_aggregate_return_pointer (funtype))
5354 int nregs = ix86_function_regparm (funtype, fundecl);
5355 if (nregs == 0)
5356 return GET_MODE_SIZE (Pmode);
5359 return 0;
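/* Illustrative sketch (not part of GCC itself; F is a made-up
   declaration): for 32-bit

     int __attribute__ ((stdcall)) f (int a, int b);

   SIZE is 8 and the function is not variadic, so the routine above
   returns 8 and the callee's "ret $8" pops its own arguments; a plain
   cdecl function returns 0 and leaves the cleanup to the caller.  */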
5362 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5364 static bool
5365 ix86_legitimate_combined_insn (rtx insn)
5367 /* Check operand constraints in case hard registers were propagated
5368 into the insn pattern. This check prevents the combine pass from
5369 generating insn patterns with invalid hard register operands.
5370 These invalid insns can eventually confuse reload into erroring out
5371 with a spill failure. See also PRs 46829 and 46843. */
5372 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5374 int i;
5376 extract_insn (insn);
5377 preprocess_constraints ();
5379 for (i = 0; i < recog_data.n_operands; i++)
5381 rtx op = recog_data.operand[i];
5382 enum machine_mode mode = GET_MODE (op);
5383 struct operand_alternative *op_alt;
5384 int offset = 0;
5385 bool win;
5386 int j;
5388 /* A unary operator may be accepted by the predicate, but it
5389 is irrelevant for matching constraints. */
5390 if (UNARY_P (op))
5391 op = XEXP (op, 0);
5393 if (GET_CODE (op) == SUBREG)
5395 if (REG_P (SUBREG_REG (op))
5396 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5397 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5398 GET_MODE (SUBREG_REG (op)),
5399 SUBREG_BYTE (op),
5400 GET_MODE (op));
5401 op = SUBREG_REG (op);
5404 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5405 continue;
5407 op_alt = recog_op_alt[i];
5409 /* Operand has no constraints, anything is OK. */
5410 win = !recog_data.n_alternatives;
5412 for (j = 0; j < recog_data.n_alternatives; j++)
5414 if (op_alt[j].anything_ok
5415 || (op_alt[j].matches != -1
5416 && operands_match_p
5417 (recog_data.operand[i],
5418 recog_data.operand[op_alt[j].matches]))
5419 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5421 win = true;
5422 break;
5426 if (!win)
5427 return false;
5431 return true;
5434 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5436 static unsigned HOST_WIDE_INT
5437 ix86_asan_shadow_offset (void)
5439 return (unsigned HOST_WIDE_INT) 1 << (TARGET_LP64 ? 44 : 29);
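/* Illustrative sketch (not part of GCC itself): AddressSanitizer maps
   an application address to its shadow byte as

     shadow = (addr >> 3) + ix86_asan_shadow_offset ()

   so with the LP64 offset above (1 << 44), the address 0x7fff8000
   would be checked through the shadow byte at
   0x100000000000 + 0xffff000 == 0x10000ffff000.  */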
5442 /* Argument support functions. */
5444 /* Return true when register may be used to pass function parameters. */
5445 bool
5446 ix86_function_arg_regno_p (int regno)
5448 int i;
5449 const int *parm_regs;
5451 if (!TARGET_64BIT)
5453 if (TARGET_MACHO)
5454 return (regno < REGPARM_MAX
5455 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5456 else
5457 return (regno < REGPARM_MAX
5458 || (TARGET_MMX && MMX_REGNO_P (regno)
5459 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5460 || (TARGET_SSE && SSE_REGNO_P (regno)
5461 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5464 if (TARGET_MACHO)
5466 if (SSE_REGNO_P (regno) && TARGET_SSE)
5467 return true;
5469 else
5471 if (TARGET_SSE && SSE_REGNO_P (regno)
5472 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5473 return true;
5476 /* TODO: The function should depend on current function ABI but
5477 builtins.c would need updating then. Therefore we use the
5478 default ABI. */
5480 /* RAX is used as hidden argument to va_arg functions. */
5481 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5482 return true;
5484 if (ix86_abi == MS_ABI)
5485 parm_regs = x86_64_ms_abi_int_parameter_registers;
5486 else
5487 parm_regs = x86_64_int_parameter_registers;
5488 for (i = 0; i < (ix86_abi == MS_ABI
5489 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5490 if (regno == parm_regs[i])
5491 return true;
5492 return false;
5495 /* Return true if we do not know how to pass TYPE solely in registers. */
5497 static bool
5498 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5500 if (must_pass_in_stack_var_size_or_pad (mode, type))
5501 return true;
5503 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5504 The layout_type routine is crafty and tries to trick us into passing
5505 currently unsupported vector types on the stack by using TImode. */
5506 return (!TARGET_64BIT && mode == TImode
5507 && type && TREE_CODE (type) != VECTOR_TYPE);
5510 /* Return the size, in bytes, of the area reserved for arguments passed
5511 in registers for the function represented by FNDECL, depending on the
5512 ABI in use. */
5513 int
5514 ix86_reg_parm_stack_space (const_tree fndecl)
5516 enum calling_abi call_abi = SYSV_ABI;
5517 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5518 call_abi = ix86_function_abi (fndecl);
5519 else
5520 call_abi = ix86_function_type_abi (fndecl);
5521 if (TARGET_64BIT && call_abi == MS_ABI)
5522 return 32;
5523 return 0;
5526 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5527 call abi used. */
5528 enum calling_abi
5529 ix86_function_type_abi (const_tree fntype)
5531 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5533 enum calling_abi abi = ix86_abi;
5534 if (abi == SYSV_ABI)
5536 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5537 abi = MS_ABI;
5539 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5540 abi = SYSV_ABI;
5541 return abi;
5543 return ix86_abi;
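/* Illustrative sketch (not part of GCC itself; WINCB is a made-up
   declaration): on a 64-bit SysV default target,

     int __attribute__ ((ms_abi)) wincb (int, int, int, int);

   switches this function type to MS_ABI (first four integer arguments
   in %rcx, %rdx, %r8 and %r9 with a 32-byte shadow area), while
   __attribute__ ((sysv_abi)) forces the SysV convention under a
   Windows default; an unattributed type simply keeps ix86_abi.  */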
5546 static bool
5547 ix86_function_ms_hook_prologue (const_tree fn)
5549 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5551 if (decl_function_context (fn) != NULL_TREE)
5552 error_at (DECL_SOURCE_LOCATION (fn),
5553 "ms_hook_prologue is not compatible with nested function");
5554 else
5555 return true;
5557 return false;
5560 static enum calling_abi
5561 ix86_function_abi (const_tree fndecl)
5563 if (! fndecl)
5564 return ix86_abi;
5565 return ix86_function_type_abi (TREE_TYPE (fndecl));
5568 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5569 call abi used. */
5570 enum calling_abi
5571 ix86_cfun_abi (void)
5573 if (! cfun)
5574 return ix86_abi;
5575 return cfun->machine->call_abi;
5578 /* Write the extra assembler code needed to declare a function properly. */
5580 void
5581 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5582 tree decl)
5584 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5586 if (is_ms_hook)
5588 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5589 unsigned int filler_cc = 0xcccccccc;
5591 for (i = 0; i < filler_count; i += 4)
5592 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5595 #ifdef SUBTARGET_ASM_UNWIND_INIT
5596 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5597 #endif
5599 ASM_OUTPUT_LABEL (asm_out_file, fname);
5601 /* Output magic byte marker, if hot-patch attribute is set. */
5602 if (is_ms_hook)
5604 if (TARGET_64BIT)
5606 /* leaq [%rsp + 0], %rsp */
5607 asm_fprintf (asm_out_file, ASM_BYTE
5608 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5610 else
5612 /* movl.s %edi, %edi
5613 push %ebp
5614 movl.s %esp, %ebp */
5615 asm_fprintf (asm_out_file, ASM_BYTE
5616 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5621 /* regclass.c */
5622 extern void init_regs (void);
5624 /* Implementation of the call-ABI-switching target hook. For the given
5625 FNDECL, the corresponding call register sets are selected. See also
5626 ix86_conditional_register_usage for more details. */
5627 void
5628 ix86_call_abi_override (const_tree fndecl)
5630 if (fndecl == NULL_TREE)
5631 cfun->machine->call_abi = ix86_abi;
5632 else
5633 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5636 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5637 expensive re-initialization of init_regs each time we switch function context
5638 since this is needed only during RTL expansion. */
5639 static void
5640 ix86_maybe_switch_abi (void)
5642 if (TARGET_64BIT &&
5643 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5644 reinit_regs ();
5647 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5648 for a call to a function whose data type is FNTYPE.
5649 For a library call, FNTYPE is 0. */
5651 void
5652 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5653 tree fntype, /* tree ptr for function decl */
5654 rtx libname, /* SYMBOL_REF of library name or 0 */
5655 tree fndecl,
5656 int caller)
5658 struct cgraph_local_info *i;
5660 memset (cum, 0, sizeof (*cum));
5662 if (fndecl)
5664 i = cgraph_local_info (fndecl);
5665 cum->call_abi = ix86_function_abi (fndecl);
5667 else
5669 i = NULL;
5670 cum->call_abi = ix86_function_type_abi (fntype);
5673 cum->caller = caller;
5675 /* Set up the number of registers to use for passing arguments. */
5677 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5678 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5679 "or subtarget optimization implying it");
5680 cum->nregs = ix86_regparm;
5681 if (TARGET_64BIT)
5683 cum->nregs = (cum->call_abi == SYSV_ABI
5684 ? X86_64_REGPARM_MAX
5685 : X86_64_MS_REGPARM_MAX);
5687 if (TARGET_SSE)
5689 cum->sse_nregs = SSE_REGPARM_MAX;
5690 if (TARGET_64BIT)
5692 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5693 ? X86_64_SSE_REGPARM_MAX
5694 : X86_64_MS_SSE_REGPARM_MAX);
5697 if (TARGET_MMX)
5698 cum->mmx_nregs = MMX_REGPARM_MAX;
5699 cum->warn_avx = true;
5700 cum->warn_sse = true;
5701 cum->warn_mmx = true;
5703 /* Because the type might mismatch between caller and callee, we need to
5704 use the actual type of the function for local calls.
5705 FIXME: cgraph_analyze can be told to actually record if a function uses
5706 va_start, so for local functions maybe_vaarg can be made aggressive,
5707 helping K&R code.
5708 FIXME: once the type system is fixed, we won't need this code anymore. */
5709 if (i && i->local && i->can_change_signature)
5710 fntype = TREE_TYPE (fndecl);
5711 cum->maybe_vaarg = (fntype
5712 ? (!prototype_p (fntype) || stdarg_p (fntype))
5713 : !libname);
5715 if (!TARGET_64BIT)
5717 /* If there are variable arguments, then we won't pass anything
5718 in registers in 32-bit mode. */
5719 if (stdarg_p (fntype))
5721 cum->nregs = 0;
5722 cum->sse_nregs = 0;
5723 cum->mmx_nregs = 0;
5724 cum->warn_avx = 0;
5725 cum->warn_sse = 0;
5726 cum->warn_mmx = 0;
5727 return;
5730 /* Use ecx and edx registers if function has fastcall attribute,
5731 else look for regparm information. */
5732 if (fntype)
5734 unsigned int ccvt = ix86_get_callcvt (fntype);
5735 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5737 cum->nregs = 1;
5738 cum->fastcall = 1; /* Same first register as in fastcall. */
5740 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5742 cum->nregs = 2;
5743 cum->fastcall = 1;
5745 else
5746 cum->nregs = ix86_function_regparm (fntype, fndecl);
5749 /* Set up the number of SSE registers used for passing SFmode
5750 and DFmode arguments. Warn for mismatching ABI. */
5751 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5755 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5756 But in the case of vector types, it is some vector mode.
5758 When we have only some of our vector isa extensions enabled, then there
5759 are some modes for which vector_mode_supported_p is false. For these
5760 modes, the generic vector support in gcc will choose some non-vector mode
5761 in order to implement the type. By computing the natural mode, we'll
5762 select the proper ABI location for the operand and not depend on whatever
5763 the middle-end decides to do with these vector types.
5765 The middle-end can't deal with vector types larger than 16 bytes. In this
5766 case, we return the original mode and warn about the ABI change if CUM isn't
5767 NULL. */
5769 static enum machine_mode
5770 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5772 enum machine_mode mode = TYPE_MODE (type);
5774 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5776 HOST_WIDE_INT size = int_size_in_bytes (type);
5777 if ((size == 8 || size == 16 || size == 32)
5778 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5779 && TYPE_VECTOR_SUBPARTS (type) > 1)
5781 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5783 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5784 mode = MIN_MODE_VECTOR_FLOAT;
5785 else
5786 mode = MIN_MODE_VECTOR_INT;
5788 /* Get the mode which has this inner mode and number of units. */
5789 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5790 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5791 && GET_MODE_INNER (mode) == innermode)
5793 if (size == 32 && !TARGET_AVX)
5795 static bool warnedavx;
5797 if (cum
5798 && !warnedavx
5799 && cum->warn_avx)
5801 warnedavx = true;
5802 warning (0, "AVX vector argument without AVX "
5803 "enabled changes the ABI");
5805 return TYPE_MODE (type);
5807 else if ((size == 8 || size == 16) && !TARGET_SSE)
5809 static bool warnedsse;
5811 if (cum
5812 && !warnedsse
5813 && cum->warn_sse)
5815 warnedsse = true;
5816 warning (0, "SSE vector argument without SSE "
5817 "enabled changes the ABI");
5819 return mode;
5821 else
5822 return mode;
5825 gcc_unreachable ();
5829 return mode;
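/* Illustrative sketch (not part of GCC itself; V4SF is a made-up
   typedef): for

     typedef float v4sf __attribute__ ((vector_size (16)));

   compiled without SSE, TYPE_MODE does not give a vector mode, but the
   loop above still recovers V4SFmode (four SFmode units), so the
   argument keeps its ABI location and the "SSE vector argument without
   SSE enabled changes the ABI" warning is issued once.  */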
5832 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5833 this may not agree with the mode that the type system has chosen for the
5834 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5835 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5837 static rtx
5838 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5839 unsigned int regno)
5841 rtx tmp;
5843 if (orig_mode != BLKmode)
5844 tmp = gen_rtx_REG (orig_mode, regno);
5845 else
5847 tmp = gen_rtx_REG (mode, regno);
5848 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5849 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5852 return tmp;
5855 /* x86-64 register passing implementation. See the x86-64 ABI for details. The goal
5856 of this code is to classify each eightbyte of an incoming argument by the register
5857 class and assign registers accordingly. */
5859 /* Return the union class of CLASS1 and CLASS2.
5860 See the x86-64 PS ABI for details. */
5862 static enum x86_64_reg_class
5863 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5865 /* Rule #1: If both classes are equal, this is the resulting class. */
5866 if (class1 == class2)
5867 return class1;
5869 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5870 the other class. */
5871 if (class1 == X86_64_NO_CLASS)
5872 return class2;
5873 if (class2 == X86_64_NO_CLASS)
5874 return class1;
5876 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5877 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5878 return X86_64_MEMORY_CLASS;
5880 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5881 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5882 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5883 return X86_64_INTEGERSI_CLASS;
5884 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5885 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5886 return X86_64_INTEGER_CLASS;
5888 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5889 MEMORY is used. */
5890 if (class1 == X86_64_X87_CLASS
5891 || class1 == X86_64_X87UP_CLASS
5892 || class1 == X86_64_COMPLEX_X87_CLASS
5893 || class2 == X86_64_X87_CLASS
5894 || class2 == X86_64_X87UP_CLASS
5895 || class2 == X86_64_COMPLEX_X87_CLASS)
5896 return X86_64_MEMORY_CLASS;
5898 /* Rule #6: Otherwise class SSE is used. */
5899 return X86_64_SSE_CLASS;
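/* Illustrative worked example (not part of GCC itself): for

     struct { float f; int i; };

   the single eightbyte first picks up X86_64_SSESF_CLASS from F and
   then merges X86_64_INTEGERSI_CLASS from I; rule #4 above resolves
   that pair to an integer class, so the whole struct travels in one
   general-purpose register.  */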
5902 /* Classify the argument of type TYPE and mode MODE.
5903 CLASSES will be filled by the register class used to pass each word
5904 of the operand. The number of words is returned. In case the parameter
5905 should be passed in memory, 0 is returned. As a special case for zero
5906 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5908 BIT_OFFSET is used internally for handling records and specifies the
5909 offset in bits modulo 256 to avoid overflow cases.
5911 See the x86-64 PS ABI for details.
5914 static int
5915 classify_argument (enum machine_mode mode, const_tree type,
5916 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5918 HOST_WIDE_INT bytes =
5919 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5920 int words
5921 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5923 /* Variable sized entities are always passed/returned in memory. */
5924 if (bytes < 0)
5925 return 0;
5927 if (mode != VOIDmode
5928 && targetm.calls.must_pass_in_stack (mode, type))
5929 return 0;
5931 /* Special case check for pointer to shared, on 64-bit target. */
5932 if (TARGET_64BIT && mode == TImode
5933 && type && TREE_CODE (type) == POINTER_TYPE
5934 && upc_shared_type_p (TREE_TYPE (type)))
5936 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5937 return 2;
5940 if (type && AGGREGATE_TYPE_P (type))
5942 int i;
5943 tree field;
5944 enum x86_64_reg_class subclasses[MAX_CLASSES];
5946 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5947 if (bytes > 32)
5948 return 0;
5950 for (i = 0; i < words; i++)
5951 classes[i] = X86_64_NO_CLASS;
5953 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5954 signal the memory class, so handle it as a special case. */
5955 if (!words)
5957 classes[0] = X86_64_NO_CLASS;
5958 return 1;
5961 /* Classify each field of record and merge classes. */
5962 switch (TREE_CODE (type))
5964 case RECORD_TYPE:
5965 /* And now merge the fields of structure. */
5966 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5968 if (TREE_CODE (field) == FIELD_DECL)
5970 int num;
5972 if (TREE_TYPE (field) == error_mark_node)
5973 continue;
5975 /* Bitfields are always classified as integer. Handle them
5976 early, since later code would consider them to be
5977 misaligned integers. */
5978 if (DECL_BIT_FIELD (field))
5980 for (i = (int_bit_position (field)
5981 + (bit_offset % 64)) / 8 / 8;
5982 i < ((int_bit_position (field) + (bit_offset % 64))
5983 + tree_low_cst (DECL_SIZE (field), 0)
5984 + 63) / 8 / 8; i++)
5985 classes[i] =
5986 merge_classes (X86_64_INTEGER_CLASS,
5987 classes[i]);
5989 else
5991 int pos;
5993 type = TREE_TYPE (field);
5995 /* Flexible array member is ignored. */
5996 if (TYPE_MODE (type) == BLKmode
5997 && TREE_CODE (type) == ARRAY_TYPE
5998 && TYPE_SIZE (type) == NULL_TREE
5999 && TYPE_DOMAIN (type) != NULL_TREE
6000 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6001 == NULL_TREE))
6003 static bool warned;
6005 if (!warned && warn_psabi)
6007 warned = true;
6008 inform (input_location,
6009 "the ABI of passing struct with"
6010 " a flexible array member has"
6011 " changed in GCC 4.4");
6013 continue;
6015 num = classify_argument (TYPE_MODE (type), type,
6016 subclasses,
6017 (int_bit_position (field)
6018 + bit_offset) % 256);
6019 if (!num)
6020 return 0;
6021 pos = (int_bit_position (field)
6022 + (bit_offset % 64)) / 8 / 8;
6023 for (i = 0; i < num && (i + pos) < words; i++)
6024 classes[i + pos] =
6025 merge_classes (subclasses[i], classes[i + pos]);
6029 break;
6031 case ARRAY_TYPE:
6032 /* Arrays are handled as small records. */
6034 int num;
6035 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6036 TREE_TYPE (type), subclasses, bit_offset);
6037 if (!num)
6038 return 0;
6040 /* The partial classes are now full classes. */
6041 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6042 subclasses[0] = X86_64_SSE_CLASS;
6043 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6044 && !((bit_offset % 64) == 0 && bytes == 4))
6045 subclasses[0] = X86_64_INTEGER_CLASS;
6047 for (i = 0; i < words; i++)
6048 classes[i] = subclasses[i % num];
6050 break;
6052 case UNION_TYPE:
6053 case QUAL_UNION_TYPE:
6054 /* Unions are similar to RECORD_TYPE but the offset is always 0.  */
6056 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6058 if (TREE_CODE (field) == FIELD_DECL)
6060 int num;
6062 if (TREE_TYPE (field) == error_mark_node)
6063 continue;
6065 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6066 TREE_TYPE (field), subclasses,
6067 bit_offset);
6068 if (!num)
6069 return 0;
6070 for (i = 0; i < num; i++)
6071 classes[i] = merge_classes (subclasses[i], classes[i]);
6074 break;
6076 default:
6077 gcc_unreachable ();
6080 if (words > 2)
6082 /* When the size is greater than 16 bytes, if the first class isn't
6083 X86_64_SSE_CLASS or any of the other classes aren't
6084 X86_64_SSEUP_CLASS, everything should be passed in
6085 memory. */
6086 if (classes[0] != X86_64_SSE_CLASS)
6087 return 0;
6089 for (i = 1; i < words; i++)
6090 if (classes[i] != X86_64_SSEUP_CLASS)
6091 return 0;
6094 /* Final merger cleanup. */
6095 for (i = 0; i < words; i++)
6097 /* If one class is MEMORY, everything should be passed in
6098 memory. */
6099 if (classes[i] == X86_64_MEMORY_CLASS)
6100 return 0;
6102 /* The X86_64_SSEUP_CLASS should be always preceded by
6103 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6104 if (classes[i] == X86_64_SSEUP_CLASS
6105 && classes[i - 1] != X86_64_SSE_CLASS
6106 && classes[i - 1] != X86_64_SSEUP_CLASS)
6108 /* The first one should never be X86_64_SSEUP_CLASS. */
6109 gcc_assert (i != 0);
6110 classes[i] = X86_64_SSE_CLASS;
6113 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6114 everything should be passed in memory. */
6115 if (classes[i] == X86_64_X87UP_CLASS
6116 && (classes[i - 1] != X86_64_X87_CLASS))
6118 static bool warned;
6120 /* The first one should never be X86_64_X87UP_CLASS. */
6121 gcc_assert (i != 0);
6122 if (!warned && warn_psabi)
6124 warned = true;
6125 inform (input_location,
6126 "the ABI of passing union with long double"
6127 " has changed in GCC 4.4");
6129 return 0;
6132 return words;
6135 /* Compute alignment needed. We align all types to natural boundaries with
6136 the exception of XFmode, which is aligned to 64 bits. */
6137 if (mode != VOIDmode && mode != BLKmode)
6139 int mode_alignment = GET_MODE_BITSIZE (mode);
6141 if (mode == XFmode)
6142 mode_alignment = 128;
6143 else if (mode == XCmode)
6144 mode_alignment = 256;
6145 if (COMPLEX_MODE_P (mode))
6146 mode_alignment /= 2;
6147 /* Misaligned fields are always returned in memory. */
6148 if (bit_offset % mode_alignment)
6149 return 0;
6152 /* For V1xx modes, just use the base mode. */
6153 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6154 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6155 mode = GET_MODE_INNER (mode);
6157 /* Classification of atomic types. */
6158 switch (mode)
6160 case SDmode:
6161 case DDmode:
6162 classes[0] = X86_64_SSE_CLASS;
6163 return 1;
6164 case TDmode:
6165 classes[0] = X86_64_SSE_CLASS;
6166 classes[1] = X86_64_SSEUP_CLASS;
6167 return 2;
6168 case DImode:
6169 case SImode:
6170 case HImode:
6171 case QImode:
6172 case CSImode:
6173 case CHImode:
6174 case CQImode:
6176 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6178 if (size <= 32)
6180 classes[0] = X86_64_INTEGERSI_CLASS;
6181 return 1;
6183 else if (size <= 64)
6185 classes[0] = X86_64_INTEGER_CLASS;
6186 return 1;
6188 else if (size <= 64+32)
6190 classes[0] = X86_64_INTEGER_CLASS;
6191 classes[1] = X86_64_INTEGERSI_CLASS;
6192 return 2;
6194 else if (size <= 64+64)
6196 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6197 return 2;
6199 else
6200 gcc_unreachable ();
6202 case CDImode:
6203 case TImode:
6204 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6205 return 2;
6206 case COImode:
6207 case OImode:
6208 /* OImode shouldn't be used directly. */
6209 gcc_unreachable ();
6210 case CTImode:
6211 return 0;
6212 case SFmode:
6213 if (!(bit_offset % 64))
6214 classes[0] = X86_64_SSESF_CLASS;
6215 else
6216 classes[0] = X86_64_SSE_CLASS;
6217 return 1;
6218 case DFmode:
6219 classes[0] = X86_64_SSEDF_CLASS;
6220 return 1;
6221 case XFmode:
6222 classes[0] = X86_64_X87_CLASS;
6223 classes[1] = X86_64_X87UP_CLASS;
6224 return 2;
6225 case TFmode:
6226 classes[0] = X86_64_SSE_CLASS;
6227 classes[1] = X86_64_SSEUP_CLASS;
6228 return 2;
6229 case SCmode:
6230 classes[0] = X86_64_SSE_CLASS;
6231 if (!(bit_offset % 64))
6232 return 1;
6233 else
6235 static bool warned;
6237 if (!warned && warn_psabi)
6239 warned = true;
6240 inform (input_location,
6241 "the ABI of passing structure with complex float"
6242 " member has changed in GCC 4.4");
6244 classes[1] = X86_64_SSESF_CLASS;
6245 return 2;
6247 case DCmode:
6248 classes[0] = X86_64_SSEDF_CLASS;
6249 classes[1] = X86_64_SSEDF_CLASS;
6250 return 2;
6251 case XCmode:
6252 classes[0] = X86_64_COMPLEX_X87_CLASS;
6253 return 1;
6254 case TCmode:
6255 /* This mode is larger than 16 bytes. */
6256 return 0;
6257 case V8SFmode:
6258 case V8SImode:
6259 case V32QImode:
6260 case V16HImode:
6261 case V4DFmode:
6262 case V4DImode:
6263 classes[0] = X86_64_SSE_CLASS;
6264 classes[1] = X86_64_SSEUP_CLASS;
6265 classes[2] = X86_64_SSEUP_CLASS;
6266 classes[3] = X86_64_SSEUP_CLASS;
6267 return 4;
6268 case V4SFmode:
6269 case V4SImode:
6270 case V16QImode:
6271 case V8HImode:
6272 case V2DFmode:
6273 case V2DImode:
6274 classes[0] = X86_64_SSE_CLASS;
6275 classes[1] = X86_64_SSEUP_CLASS;
6276 return 2;
6277 case V1TImode:
6278 case V1DImode:
6279 case V2SFmode:
6280 case V2SImode:
6281 case V4HImode:
6282 case V8QImode:
6283 classes[0] = X86_64_SSE_CLASS;
6284 return 1;
6285 case BLKmode:
6286 case VOIDmode:
6287 return 0;
6288 default:
6289 gcc_assert (VECTOR_MODE_P (mode));
6291 if (bytes > 16)
6292 return 0;
6294 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6296 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6297 classes[0] = X86_64_INTEGERSI_CLASS;
6298 else
6299 classes[0] = X86_64_INTEGER_CLASS;
6300 classes[1] = X86_64_INTEGER_CLASS;
6301 return 1 + (bytes > 8);
6305 /* Examine the argument and set the number of registers required in each
6306 class. Return 0 iff the parameter should be passed in memory. */
6307 static int
6308 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6309 int *int_nregs, int *sse_nregs)
6311 enum x86_64_reg_class regclass[MAX_CLASSES];
6312 int n = classify_argument (mode, type, regclass, 0);
6314 *int_nregs = 0;
6315 *sse_nregs = 0;
6316 if (!n)
6317 return 0;
6318 for (n--; n >= 0; n--)
6319 switch (regclass[n])
6321 case X86_64_INTEGER_CLASS:
6322 case X86_64_INTEGERSI_CLASS:
6323 (*int_nregs)++;
6324 break;
6325 case X86_64_SSE_CLASS:
6326 case X86_64_SSESF_CLASS:
6327 case X86_64_SSEDF_CLASS:
6328 (*sse_nregs)++;
6329 break;
6330 case X86_64_NO_CLASS:
6331 case X86_64_SSEUP_CLASS:
6332 break;
6333 case X86_64_X87_CLASS:
6334 case X86_64_X87UP_CLASS:
6335 if (!in_return)
6336 return 0;
6337 break;
6338 case X86_64_COMPLEX_X87_CLASS:
6339 return in_return ? 2 : 0;
6340 case X86_64_MEMORY_CLASS:
6341 gcc_unreachable ();
6343 return 1;
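/* Illustrative worked example (not part of GCC itself): for

     struct { double d; long l; };

   classify_argument produces { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS },
   so this routine reports one SSE and one integer register; if the
   caller has fewer of either kind left, construct_container gives up
   and the whole struct is passed on the stack.  */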
6346 /* Construct container for the argument used by GCC interface. See
6347 FUNCTION_ARG for the detailed description. */
6349 static rtx
6350 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6351 const_tree type, int in_return, int nintregs, int nsseregs,
6352 const int *intreg, int sse_regno)
6354 /* The following variables hold the static issued_error state. */
6355 static bool issued_sse_arg_error;
6356 static bool issued_sse_ret_error;
6357 static bool issued_x87_ret_error;
6359 enum machine_mode tmpmode;
6360 int bytes =
6361 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6362 enum x86_64_reg_class regclass[MAX_CLASSES];
6363 int n;
6364 int i;
6365 int nexps = 0;
6366 int needed_sseregs, needed_intregs;
6367 rtx exp[MAX_CLASSES];
6368 rtx ret;
6370 n = classify_argument (mode, type, regclass, 0);
6371 if (!n)
6372 return NULL;
6373 if (!examine_argument (mode, type, in_return, &needed_intregs,
6374 &needed_sseregs))
6375 return NULL;
6376 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6377 return NULL;
6379 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6380 some less clueful developer tries to use floating-point anyway. */
6381 if (needed_sseregs && !TARGET_SSE)
6383 if (in_return)
6385 if (!issued_sse_ret_error)
6387 error ("SSE register return with SSE disabled");
6388 issued_sse_ret_error = true;
6391 else if (!issued_sse_arg_error)
6393 error ("SSE register argument with SSE disabled");
6394 issued_sse_arg_error = true;
6396 return NULL;
6399 /* Likewise, error if the ABI requires us to return values in the
6400 x87 registers and the user specified -mno-80387. */
6401 if (!TARGET_80387 && in_return)
6402 for (i = 0; i < n; i++)
6403 if (regclass[i] == X86_64_X87_CLASS
6404 || regclass[i] == X86_64_X87UP_CLASS
6405 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6407 if (!issued_x87_ret_error)
6409 error ("x87 register return with x87 disabled");
6410 issued_x87_ret_error = true;
6412 return NULL;
6415 /* First construct simple cases. Avoid SCmode, since we want to use
6416 a single register to pass this type. */
6417 if (n == 1 && mode != SCmode)
6418 switch (regclass[0])
6420 case X86_64_INTEGER_CLASS:
6421 case X86_64_INTEGERSI_CLASS:
6422 return gen_rtx_REG (mode, intreg[0]);
6423 case X86_64_SSE_CLASS:
6424 case X86_64_SSESF_CLASS:
6425 case X86_64_SSEDF_CLASS:
6426 if (mode != BLKmode)
6427 return gen_reg_or_parallel (mode, orig_mode,
6428 SSE_REGNO (sse_regno));
6429 break;
6430 case X86_64_X87_CLASS:
6431 case X86_64_COMPLEX_X87_CLASS:
6432 return gen_rtx_REG (mode, FIRST_STACK_REG);
6433 case X86_64_NO_CLASS:
6434 /* Zero sized array, struct or class. */
6435 return NULL;
6436 default:
6437 gcc_unreachable ();
6439 if (n == 2
6440 && regclass[0] == X86_64_SSE_CLASS
6441 && regclass[1] == X86_64_SSEUP_CLASS
6442 && mode != BLKmode)
6443 return gen_reg_or_parallel (mode, orig_mode,
6444 SSE_REGNO (sse_regno));
6445 if (n == 4
6446 && regclass[0] == X86_64_SSE_CLASS
6447 && regclass[1] == X86_64_SSEUP_CLASS
6448 && regclass[2] == X86_64_SSEUP_CLASS
6449 && regclass[3] == X86_64_SSEUP_CLASS
6450 && mode != BLKmode)
6451 return gen_reg_or_parallel (mode, orig_mode,
6452 SSE_REGNO (sse_regno));
6453 if (n == 2
6454 && regclass[0] == X86_64_X87_CLASS
6455 && regclass[1] == X86_64_X87UP_CLASS)
6456 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6458 if (n == 2
6459 && regclass[0] == X86_64_INTEGER_CLASS
6460 && regclass[1] == X86_64_INTEGER_CLASS
6461 && (mode == CDImode || mode == TImode || mode == TFmode)
6462 && intreg[0] + 1 == intreg[1])
6463 return gen_rtx_REG (mode, intreg[0]);
6465 /* Otherwise figure out the entries of the PARALLEL. */
6466 for (i = 0; i < n; i++)
6468 int pos;
6470 switch (regclass[i])
6472 case X86_64_NO_CLASS:
6473 break;
6474 case X86_64_INTEGER_CLASS:
6475 case X86_64_INTEGERSI_CLASS:
6476 /* Merge TImodes on aligned occasions here too. */
6477 if (i * 8 + 8 > bytes)
6478 tmpmode
6479 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6480 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6481 tmpmode = SImode;
6482 else
6483 tmpmode = DImode;
6484 /* We've requested 24 bytes for which we
6485 don't have a mode. Use DImode. */
6486 if (tmpmode == BLKmode)
6487 tmpmode = DImode;
6488 exp [nexps++]
6489 = gen_rtx_EXPR_LIST (VOIDmode,
6490 gen_rtx_REG (tmpmode, *intreg),
6491 GEN_INT (i*8));
6492 intreg++;
6493 break;
6494 case X86_64_SSESF_CLASS:
6495 exp [nexps++]
6496 = gen_rtx_EXPR_LIST (VOIDmode,
6497 gen_rtx_REG (SFmode,
6498 SSE_REGNO (sse_regno)),
6499 GEN_INT (i*8));
6500 sse_regno++;
6501 break;
6502 case X86_64_SSEDF_CLASS:
6503 exp [nexps++]
6504 = gen_rtx_EXPR_LIST (VOIDmode,
6505 gen_rtx_REG (DFmode,
6506 SSE_REGNO (sse_regno)),
6507 GEN_INT (i*8));
6508 sse_regno++;
6509 break;
6510 case X86_64_SSE_CLASS:
6511 pos = i;
6512 switch (n)
6514 case 1:
6515 tmpmode = DImode;
6516 break;
6517 case 2:
6518 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6520 tmpmode = TImode;
6521 i++;
6523 else
6524 tmpmode = DImode;
6525 break;
6526 case 4:
6527 gcc_assert (i == 0
6528 && regclass[1] == X86_64_SSEUP_CLASS
6529 && regclass[2] == X86_64_SSEUP_CLASS
6530 && regclass[3] == X86_64_SSEUP_CLASS);
6531 tmpmode = OImode;
6532 i += 3;
6533 break;
6534 default:
6535 gcc_unreachable ();
6537 exp [nexps++]
6538 = gen_rtx_EXPR_LIST (VOIDmode,
6539 gen_rtx_REG (tmpmode,
6540 SSE_REGNO (sse_regno)),
6541 GEN_INT (pos*8));
6542 sse_regno++;
6543 break;
6544 default:
6545 gcc_unreachable ();
6549 /* Empty aligned struct, union or class. */
6550 if (nexps == 0)
6551 return NULL;
6553 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6554 for (i = 0; i < nexps; i++)
6555 XVECEXP (ret, 0, i) = exp [i];
6556 return ret;
6559 /* Update the data in CUM to advance over an argument of mode MODE
6560 and data type TYPE. (TYPE is null for libcalls where that information
6561 may not be available.) */
6563 static void
6564 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6565 const_tree type, HOST_WIDE_INT bytes,
6566 HOST_WIDE_INT words)
6568 switch (mode)
6570 default:
6571 break;
6573 case BLKmode:
6574 if (bytes < 0)
6575 break;
6576 /* FALLTHRU */
6578 case DImode:
6579 case SImode:
6580 case HImode:
6581 case QImode:
6582 cum->words += words;
6583 cum->nregs -= words;
6584 cum->regno += words;
6586 if (cum->nregs <= 0)
6588 cum->nregs = 0;
6589 cum->regno = 0;
6591 break;
6593 case OImode:
6594 /* OImode shouldn't be used directly. */
6595 gcc_unreachable ();
6597 case DFmode:
6598 if (cum->float_in_sse < 2)
6599 break;
6600 case SFmode:
6601 if (cum->float_in_sse < 1)
6602 break;
6603 /* FALLTHRU */
6605 case V8SFmode:
6606 case V8SImode:
6607 case V32QImode:
6608 case V16HImode:
6609 case V4DFmode:
6610 case V4DImode:
6611 case TImode:
6612 case V16QImode:
6613 case V8HImode:
6614 case V4SImode:
6615 case V2DImode:
6616 case V4SFmode:
6617 case V2DFmode:
6618 if (!type || !AGGREGATE_TYPE_P (type))
6620 cum->sse_words += words;
6621 cum->sse_nregs -= 1;
6622 cum->sse_regno += 1;
6623 if (cum->sse_nregs <= 0)
6625 cum->sse_nregs = 0;
6626 cum->sse_regno = 0;
6629 break;
6631 case V8QImode:
6632 case V4HImode:
6633 case V2SImode:
6634 case V2SFmode:
6635 case V1TImode:
6636 case V1DImode:
6637 if (!type || !AGGREGATE_TYPE_P (type))
6639 cum->mmx_words += words;
6640 cum->mmx_nregs -= 1;
6641 cum->mmx_regno += 1;
6642 if (cum->mmx_nregs <= 0)
6644 cum->mmx_nregs = 0;
6645 cum->mmx_regno = 0;
6648 break;
6652 static void
6653 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6654 const_tree type, HOST_WIDE_INT words, bool named)
6656 int int_nregs, sse_nregs;
6658 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6659 if (!named && VALID_AVX256_REG_MODE (mode))
6660 return;
6662 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6663 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6665 cum->nregs -= int_nregs;
6666 cum->sse_nregs -= sse_nregs;
6667 cum->regno += int_nregs;
6668 cum->sse_regno += sse_nregs;
6670 else
6672 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6673 cum->words = (cum->words + align - 1) & ~(align - 1);
6674 cum->words += words;
6678 static void
6679 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6680 HOST_WIDE_INT words)
6682 /* Otherwise, this should be passed indirectly. */
6683 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6685 cum->words += words;
6686 if (cum->nregs > 0)
6688 cum->nregs -= 1;
6689 cum->regno += 1;
6693 /* Update the data in CUM to advance over an argument of mode MODE and
6694 data type TYPE. (TYPE is null for libcalls where that information
6695 may not be available.) */
6697 static void
6698 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6699 const_tree type, bool named)
6701 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6702 HOST_WIDE_INT bytes, words;
6704 if (mode == BLKmode)
6705 bytes = int_size_in_bytes (type);
6706 else
6707 bytes = GET_MODE_SIZE (mode);
6708 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6710 if (type)
6711 mode = type_natural_mode (type, NULL);
6713 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6714 function_arg_advance_ms_64 (cum, bytes, words);
6715 else if (TARGET_64BIT)
6716 function_arg_advance_64 (cum, mode, type, words, named);
6717 else
6718 function_arg_advance_32 (cum, mode, type, bytes, words);
6721 /* Define where to put the arguments to a function.
6722 Value is zero to push the argument on the stack,
6723 or a hard register in which to store the argument.
6725 MODE is the argument's machine mode.
6726 TYPE is the data type of the argument (as a tree).
6727 This is null for libcalls where that information may
6728 not be available.
6729 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6730 the preceding args and about the function being called.
6731 NAMED is nonzero if this argument is a named parameter
6732 (otherwise it is an extra parameter matching an ellipsis). */
6734 static rtx
6735 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6736 enum machine_mode orig_mode, const_tree type,
6737 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6739 static bool warnedsse, warnedmmx;
6741 /* Avoid the AL settings for the Unix64 ABI. */
6742 if (mode == VOIDmode)
6743 return constm1_rtx;
6745 switch (mode)
6747 default:
6748 break;
6750 case BLKmode:
6751 if (bytes < 0)
6752 break;
6753 /* FALLTHRU */
6754 case DImode:
6755 case SImode:
6756 case HImode:
6757 case QImode:
6758 if (words <= cum->nregs)
6760 int regno = cum->regno;
6762 /* Fastcall allocates the first two DWORD (SImode) or
6763 smaller arguments to ECX and EDX if it isn't an
6764 aggregate type. */
6765 if (cum->fastcall)
6767 if (mode == BLKmode
6768 || mode == DImode
6769 || (type && AGGREGATE_TYPE_P (type)))
6770 break;
6772 /* ECX not EAX is the first allocated register. */
6773 if (regno == AX_REG)
6774 regno = CX_REG;
6776 return gen_rtx_REG (mode, regno);
6778 break;
6780 case DFmode:
6781 if (cum->float_in_sse < 2)
6782 break;
6783 case SFmode:
6784 if (cum->float_in_sse < 1)
6785 break;
6786 /* FALLTHRU */
6787 case TImode:
6788 /* In 32-bit mode, we pass TImode in XMM registers. */
6789 case V16QImode:
6790 case V8HImode:
6791 case V4SImode:
6792 case V2DImode:
6793 case V4SFmode:
6794 case V2DFmode:
6795 if (!type || !AGGREGATE_TYPE_P (type))
6797 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6799 warnedsse = true;
6800 warning (0, "SSE vector argument without SSE enabled "
6801 "changes the ABI");
6803 if (cum->sse_nregs)
6804 return gen_reg_or_parallel (mode, orig_mode,
6805 cum->sse_regno + FIRST_SSE_REG);
6807 break;
6809 case OImode:
6810 /* OImode shouldn't be used directly. */
6811 gcc_unreachable ();
6813 case V8SFmode:
6814 case V8SImode:
6815 case V32QImode:
6816 case V16HImode:
6817 case V4DFmode:
6818 case V4DImode:
6819 if (!type || !AGGREGATE_TYPE_P (type))
6821 if (cum->sse_nregs)
6822 return gen_reg_or_parallel (mode, orig_mode,
6823 cum->sse_regno + FIRST_SSE_REG);
6825 break;
6827 case V8QImode:
6828 case V4HImode:
6829 case V2SImode:
6830 case V2SFmode:
6831 case V1TImode:
6832 case V1DImode:
6833 if (!type || !AGGREGATE_TYPE_P (type))
6835 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6837 warnedmmx = true;
6838 warning (0, "MMX vector argument without MMX enabled "
6839 "changes the ABI");
6841 if (cum->mmx_nregs)
6842 return gen_reg_or_parallel (mode, orig_mode,
6843 cum->mmx_regno + FIRST_MMX_REG);
6845 break;
6848 return NULL_RTX;
6851 static rtx
6852 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6853 enum machine_mode orig_mode, const_tree type, bool named)
6855 /* Handle a hidden AL argument containing the number of registers
6856 for varargs x86-64 functions. */
6857 if (mode == VOIDmode)
6858 return GEN_INT (cum->maybe_vaarg
6859 ? (cum->sse_nregs < 0
6860 ? X86_64_SSE_REGPARM_MAX
6861 : cum->sse_regno)
6862 : -1);
6864 switch (mode)
6866 default:
6867 break;
6869 case V8SFmode:
6870 case V8SImode:
6871 case V32QImode:
6872 case V16HImode:
6873 case V4DFmode:
6874 case V4DImode:
6875 /* Unnamed 256bit vector mode parameters are passed on stack. */
6876 if (!named)
6877 return NULL;
6878 break;
6881 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6882 cum->sse_nregs,
6883 &x86_64_int_parameter_registers [cum->regno],
6884 cum->sse_regno);
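/* Informal note on the VOIDmode case above: for varargs calls under the
   SysV x86-64 ABI the value ends up in %al as an upper bound on the
   number of SSE registers actually used, so e.g. a call resembling
   printf ("%f", 1.0) would be emitted with %al set to 1.  Sketch only;
   construct_container determines the precise register layout.  */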
6887 static rtx
6888 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6889 enum machine_mode orig_mode, bool named,
6890 HOST_WIDE_INT bytes)
6892 unsigned int regno;
6894 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
6895 We use the value -2 to specify that the current function call is MS ABI. */
6896 if (mode == VOIDmode)
6897 return GEN_INT (-2);
6899 /* If we've run out of registers, it goes on the stack. */
6900 if (cum->nregs == 0)
6901 return NULL_RTX;
6903 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6905 /* Only floating point modes are passed in anything but integer regs. */
6906 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6908 if (named)
6909 regno = cum->regno + FIRST_SSE_REG;
6910 else
6912 rtx t1, t2;
6914 /* Unnamed floating parameters are passed in both the
6915 SSE and integer registers. */
6916 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6917 t2 = gen_rtx_REG (mode, regno);
6918 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6919 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6920 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6923 /* Handle aggregate types passed in registers. */
6924 if (orig_mode == BLKmode)
6926 if (bytes > 0 && bytes <= 8)
6927 mode = (bytes > 4 ? DImode : SImode);
6928 if (mode == BLKmode)
6929 mode = DImode;
6932 return gen_reg_or_parallel (mode, orig_mode, regno);
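/* Informal summary of the convention implemented above (Windows x64),
   for orientation only: the first four arguments travel in RCX, RDX, R8
   and R9, or in XMM0-XMM3 for SFmode/DFmode values; aggregates of 1, 2,
   4 or 8 bytes are passed in those integer registers; everything larger
   is passed by reference (see ix86_pass_by_reference below).  */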
6935 /* Return where to put the arguments to a function.
6936 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6938 MODE is the argument's machine mode. TYPE is the data type of the
6939 argument. It is null for libcalls where that information may not be
6940 available. CUM gives information about the preceding args and about
6941 the function being called. NAMED is nonzero if this argument is a
6942 named parameter (otherwise it is an extra parameter matching an
6943 ellipsis). */
6945 static rtx
6946 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6947 const_tree type, bool named)
6949 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6950 enum machine_mode mode = omode;
6951 HOST_WIDE_INT bytes, words;
6952 rtx arg;
6954 if (mode == BLKmode)
6955 bytes = int_size_in_bytes (type);
6956 else
6957 bytes = GET_MODE_SIZE (mode);
6958 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6960 /* To simplify the code below, represent vector types with a vector mode
6961 even if MMX/SSE are not active. */
6962 if (type && TREE_CODE (type) == VECTOR_TYPE)
6963 mode = type_natural_mode (type, cum);
6965 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6966 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6967 else if (TARGET_64BIT)
6968 arg = function_arg_64 (cum, mode, omode, type, named);
6969 else
6970 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6972 return arg;
6975 /* A C expression that indicates when an argument must be passed by
6976 reference. If nonzero for an argument, a copy of that argument is
6977 made in memory and a pointer to the argument is passed instead of
6978 the argument itself. The pointer is passed in whatever way is
6979 appropriate for passing a pointer to that type. */
6981 static bool
6982 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6983 enum machine_mode mode ATTRIBUTE_UNUSED,
6984 const_tree type, bool named ATTRIBUTE_UNUSED)
6986 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6988 /* See Windows x64 Software Convention. */
6989 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6991 int msize = (int) GET_MODE_SIZE (mode);
6992 if (type)
6994 /* Arrays are passed by reference. */
6995 if (TREE_CODE (type) == ARRAY_TYPE)
6996 return true;
6998 if (AGGREGATE_TYPE_P (type))
7000 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7001 are passed by reference. */
7002 msize = int_size_in_bytes (type);
7006 /* __m128 is passed by reference. */
7007 switch (msize) {
7008 case 1: case 2: case 4: case 8:
7009 break;
7010 default:
7011 return true;
7014 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7015 return true;
7017 return false;
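/* Worked example of the rule above (illustrative): under the MS ABI a
   12-byte struct is none of the permitted sizes 1/2/4/8, so the caller
   makes a copy and passes its address, whereas an 8-byte struct is
   passed directly in an integer register.  */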
7020 /* Return true when TYPE should be 128bit aligned for 32bit argument
7021 passing ABI. XXX: This function is obsolete and is only used for
7022 checking psABI compatibility with previous versions of GCC. */
7024 static bool
7025 ix86_compat_aligned_value_p (const_tree type)
7027 enum machine_mode mode = TYPE_MODE (type);
7028 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7029 || mode == TDmode
7030 || mode == TFmode
7031 || mode == TCmode)
7032 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7033 return true;
7034 if (TYPE_ALIGN (type) < 128)
7035 return false;
7037 if (AGGREGATE_TYPE_P (type))
7039 /* Walk the aggregates recursively. */
7040 switch (TREE_CODE (type))
7042 case RECORD_TYPE:
7043 case UNION_TYPE:
7044 case QUAL_UNION_TYPE:
7046 tree field;
7048 /* Walk all the structure fields. */
7049 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7051 if (TREE_CODE (field) == FIELD_DECL
7052 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7053 return true;
7055 break;
7058 case ARRAY_TYPE:
7059 /* Just in case some languages pass arrays by value. */
7060 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7061 return true;
7062 break;
7064 default:
7065 gcc_unreachable ();
7068 return false;
7071 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7072 XXX: This function is obsolete and is only used for checking psABI
7073 compatibility with previous versions of GCC. */
7075 static unsigned int
7076 ix86_compat_function_arg_boundary (enum machine_mode mode,
7077 const_tree type, unsigned int align)
7079 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7080 natural boundaries. */
7081 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7083 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7084 make an exception for SSE modes since these require 128bit
7085 alignment.
7087 The handling here differs from field_alignment. ICC aligns MMX
7088 arguments to 4 byte boundaries, while structure fields are aligned
7089 to 8 byte boundaries. */
7090 if (!type)
7092 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7093 align = PARM_BOUNDARY;
7095 else
7097 if (!ix86_compat_aligned_value_p (type))
7098 align = PARM_BOUNDARY;
7101 if (align > BIGGEST_ALIGNMENT)
7102 align = BIGGEST_ALIGNMENT;
7103 return align;
7106 /* Return true when TYPE should be 128bit aligned for 32bit argument
7107 passing ABI. */
7109 static bool
7110 ix86_contains_aligned_value_p (const_tree type)
7112 enum machine_mode mode = TYPE_MODE (type);
7114 if (mode == XFmode || mode == XCmode)
7115 return false;
7117 if (TYPE_ALIGN (type) < 128)
7118 return false;
7120 if (AGGREGATE_TYPE_P (type))
7122 /* Walk the aggregates recursively. */
7123 switch (TREE_CODE (type))
7125 case RECORD_TYPE:
7126 case UNION_TYPE:
7127 case QUAL_UNION_TYPE:
7129 tree field;
7131 /* Walk all the structure fields. */
7132 for (field = TYPE_FIELDS (type);
7133 field;
7134 field = DECL_CHAIN (field))
7136 if (TREE_CODE (field) == FIELD_DECL
7137 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7138 return true;
7140 break;
7143 case ARRAY_TYPE:
7144 /* Just in case some languages pass arrays by value. */
7145 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7146 return true;
7147 break;
7149 default:
7150 gcc_unreachable ();
7153 else
7154 return TYPE_ALIGN (type) >= 128;
7156 return false;
7159 /* Gives the alignment boundary, in bits, of an argument with the
7160 specified mode and type. */
7162 static unsigned int
7163 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7165 unsigned int align;
7166 if (type)
7168 /* Since the main variant type is used for the call, convert the type
7169 to its main variant. */
7170 type = TYPE_MAIN_VARIANT (type);
7171 align = TYPE_ALIGN (type);
7173 else
7174 align = GET_MODE_ALIGNMENT (mode);
7175 if (align < PARM_BOUNDARY)
7176 align = PARM_BOUNDARY;
7177 else
7179 static bool warned;
7180 unsigned int saved_align = align;
7182 if (!TARGET_64BIT)
7184 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7185 if (!type)
7187 if (mode == XFmode || mode == XCmode)
7188 align = PARM_BOUNDARY;
7190 else if (!ix86_contains_aligned_value_p (type))
7191 align = PARM_BOUNDARY;
7193 if (align < 128)
7194 align = PARM_BOUNDARY;
7197 if (warn_psabi
7198 && !warned
7199 && align != ix86_compat_function_arg_boundary (mode, type,
7200 saved_align))
7202 warned = true;
7203 inform (input_location,
7204 "The ABI for passing parameters with %d-byte"
7205 " alignment has changed in GCC 4.6",
7206 align / BITS_PER_UNIT);
7210 return align;
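/* Informal example: a 16-byte aligned vector argument such as __m128
   keeps its 128-bit boundary here, while a plain int gets PARM_BOUNDARY;
   when the result differs from what pre-4.6 GCC would have used, the
   -Wpsabi note above is emitted at most once per compilation.  */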
7213 /* Return true if N is a possible register number of function value. */
7215 static bool
7216 ix86_function_value_regno_p (const unsigned int regno)
7218 switch (regno)
7220 case AX_REG:
7221 return true;
7223 case FIRST_FLOAT_REG:
7224 /* TODO: The function should depend on current function ABI but
7225 builtins.c would need updating then. Therefore we use the
7226 default ABI. */
7227 if (TARGET_64BIT && ix86_abi == MS_ABI)
7228 return false;
7229 return TARGET_FLOAT_RETURNS_IN_80387;
7231 case FIRST_SSE_REG:
7232 return TARGET_SSE;
7234 case FIRST_MMX_REG:
7235 if (TARGET_MACHO || TARGET_64BIT)
7236 return false;
7237 return TARGET_MMX;
7240 return false;
7243 /* Define how to find the value returned by a function.
7244 VALTYPE is the data type of the value (as a tree).
7245 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7246 otherwise, FUNC is 0. */
7248 static rtx
7249 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7250 const_tree fntype, const_tree fn)
7252 unsigned int regno;
7254 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7255 we normally prevent this case when mmx is not available. However
7256 some ABIs may require the result to be returned like DImode. */
7257 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7258 regno = FIRST_MMX_REG;
7260 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7261 we prevent this case when sse is not available. However some ABIs
7262 may require the result to be returned like integer TImode. */
7263 else if (mode == TImode
7264 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7265 regno = FIRST_SSE_REG;
7267 /* 32-byte vector modes in %ymm0. */
7268 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7269 regno = FIRST_SSE_REG;
7271 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7272 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7273 regno = FIRST_FLOAT_REG;
7274 else
7275 /* Most things go in %eax. */
7276 regno = AX_REG;
7278 /* Override FP return register with %xmm0 for local functions when
7279 SSE math is enabled or for functions with sseregparm attribute. */
7280 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7282 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7283 if ((sse_level >= 1 && mode == SFmode)
7284 || (sse_level == 2 && mode == DFmode))
7285 regno = FIRST_SSE_REG;
7288 /* OImode shouldn't be used directly. */
7289 gcc_assert (mode != OImode);
7291 return gen_rtx_REG (orig_mode, regno);
7294 static rtx
7295 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7296 const_tree valtype)
7298 rtx ret;
7300 /* Handle libcalls, which don't provide a type node. */
7301 if (valtype == NULL)
7303 unsigned int regno;
7305 switch (mode)
7307 case SFmode:
7308 case SCmode:
7309 case DFmode:
7310 case DCmode:
7311 case TFmode:
7312 case SDmode:
7313 case DDmode:
7314 case TDmode:
7315 regno = FIRST_SSE_REG;
7316 break;
7317 case XFmode:
7318 case XCmode:
7319 regno = FIRST_FLOAT_REG;
7320 break;
7321 case TCmode:
7322 return NULL;
7323 default:
7324 regno = AX_REG;
7327 return gen_rtx_REG (mode, regno);
7329 else if (POINTER_TYPE_P (valtype)
7330 && !upc_shared_type_p (TREE_TYPE (valtype)))
7332 /* Pointers are always returned in word_mode. */
7333 mode = word_mode;
7336 ret = construct_container (mode, orig_mode, valtype, 1,
7337 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7338 x86_64_int_return_registers, 0);
7340 /* For zero sized structures, construct_container returns NULL, but we
7341 need to keep the rest of the compiler happy by returning a meaningful value. */
7342 if (!ret)
7343 ret = gen_rtx_REG (orig_mode, AX_REG);
7345 return ret;
7348 static rtx
7349 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7350 const_tree valtype)
7352 unsigned int regno = AX_REG;
7354 if (TARGET_SSE)
7356 switch (GET_MODE_SIZE (mode))
7358 case 16:
7359 if (valtype != NULL_TREE
7360 && !VECTOR_INTEGER_TYPE_P (valtype)
7362 && !INTEGRAL_TYPE_P (valtype)
7363 && !VECTOR_FLOAT_TYPE_P (valtype))
7364 break;
7365 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7366 && !COMPLEX_MODE_P (mode))
7367 regno = FIRST_SSE_REG;
7368 break;
7369 case 8:
7370 case 4:
7371 if (mode == SFmode || mode == DFmode)
7372 regno = FIRST_SSE_REG;
7373 break;
7374 default:
7375 break;
7378 return gen_rtx_REG (orig_mode, regno);
7381 static rtx
7382 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7383 enum machine_mode orig_mode, enum machine_mode mode)
7385 const_tree fn, fntype;
7387 fn = NULL_TREE;
7388 if (fntype_or_decl && DECL_P (fntype_or_decl))
7389 fn = fntype_or_decl;
7390 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7392 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7393 return function_value_ms_64 (orig_mode, mode, valtype);
7394 else if (TARGET_64BIT)
7395 return function_value_64 (orig_mode, mode, valtype);
7396 else
7397 return function_value_32 (orig_mode, mode, fntype, fn);
7400 static rtx
7401 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7402 bool outgoing ATTRIBUTE_UNUSED)
7404 enum machine_mode mode, orig_mode;
7406 orig_mode = TYPE_MODE (valtype);
7407 mode = type_natural_mode (valtype, NULL);
7408 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7411 /* Pointer function arguments and return values are promoted to
7412 word_mode. */
7414 static enum machine_mode
7415 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7416 int *punsignedp, const_tree fntype,
7417 int for_return)
7419 if (type != NULL_TREE && POINTER_TYPE_P (type))
7421 if (upc_shared_type_p (TREE_TYPE (type)))
7423 *punsignedp = 1;
7424 return TYPE_MODE (upc_pts_rep_type_node);
7426 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7427 return word_mode;
7429 return default_promote_function_mode (type, mode, punsignedp, fntype,
7430 for_return);
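/* Informal note: because pointer arguments and return values are
   promoted to word_mode, a narrower pointer is extended unsigned to the
   full word (the interesting case being targets where ptr_mode is
   narrower than word_mode, e.g. x32), while UPC shared pointers keep the
   representation type selected above.  Sketch for orientation only.  */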
7433 /* Return true if a structure, union or array with MODE containing FIELD
7434 should be accessed using BLKmode. */
7436 static bool
7437 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7439 /* Union with XFmode must be in BLKmode. */
7440 return (mode == XFmode
7441 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7442 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7446 ix86_libcall_value (enum machine_mode mode)
7448 return ix86_function_value_1 (NULL, NULL, mode, mode);
7451 /* Return true iff type is returned in memory. */
7453 static bool ATTRIBUTE_UNUSED
7454 return_in_memory_32 (const_tree type, enum machine_mode mode)
7456 HOST_WIDE_INT size;
7458 if (mode == BLKmode)
7459 return true;
7461 size = int_size_in_bytes (type);
7463 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7464 return false;
7466 if (VECTOR_MODE_P (mode) || mode == TImode)
7468 /* User-created vectors small enough to fit in EAX. */
7469 if (size < 8)
7470 return false;
7472 /* MMX/3dNow values are returned in MM0,
7473 except when it doesn't exist or the ABI prescribes otherwise. */
7474 if (size == 8)
7475 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7477 /* SSE values are returned in XMM0, except when it doesn't exist. */
7478 if (size == 16)
7479 return !TARGET_SSE;
7481 /* AVX values are returned in YMM0, except when it doesn't exist. */
7482 if (size == 32)
7483 return !TARGET_AVX;
7486 if (mode == XFmode)
7487 return false;
7489 if (size > 12)
7490 return true;
7492 /* OImode shouldn't be used directly. */
7493 gcc_assert (mode != OImode);
7495 return false;
7498 static bool ATTRIBUTE_UNUSED
7499 return_in_memory_64 (const_tree type, enum machine_mode mode)
7501 int needed_intregs, needed_sseregs;
7502 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7505 static bool ATTRIBUTE_UNUSED
7506 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7508 HOST_WIDE_INT size = int_size_in_bytes (type);
7510 /* __m128 is returned in xmm0. */
7511 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7512 || VECTOR_FLOAT_TYPE_P (type))
7513 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7514 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7515 return false;
7517 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7518 return size != 1 && size != 2 && size != 4 && size != 8;
7521 static bool
7522 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7524 #ifdef SUBTARGET_RETURN_IN_MEMORY
7525 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7526 #else
7527 const enum machine_mode mode = type_natural_mode (type, NULL);
7529 if (TARGET_64BIT)
7531 if (ix86_function_type_abi (fntype) == MS_ABI)
7532 return return_in_memory_ms_64 (type, mode);
7533 else
7534 return return_in_memory_64 (type, mode);
7536 else
7537 return return_in_memory_32 (type, mode);
7538 #endif
7541 /* When returning SSE vector types, we have a choice of either
7542 (1) being abi incompatible with a -march switch, or
7543 (2) generating an error.
7544 Given no good solution, I think the safest thing is one warning.
7545 The user won't be able to use -Werror, but....
7547 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7548 called in response to actually generating a caller or callee that
7549 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7550 via aggregate_value_p for general type probing from tree-ssa. */
7552 static rtx
7553 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7555 static bool warnedsse, warnedmmx;
7557 if (!TARGET_64BIT && type)
7559 /* Look at the return type of the function, not the function type. */
7560 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7562 if (!TARGET_SSE && !warnedsse)
7564 if (mode == TImode
7565 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7567 warnedsse = true;
7568 warning (0, "SSE vector return without SSE enabled "
7569 "changes the ABI");
7573 if (!TARGET_MMX && !warnedmmx)
7575 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7577 warnedmmx = true;
7578 warning (0, "MMX vector return without MMX enabled "
7579 "changes the ABI");
7584 return NULL;
7588 /* Create the va_list data type. */
7590 /* Returns the calling-convention-specific va_list data type.
7591 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7593 static tree
7594 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7596 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7598 /* For i386 we use plain pointer to argument area. */
7599 if (!TARGET_64BIT || abi == MS_ABI)
7600 return build_pointer_type (char_type_node);
7602 record = lang_hooks.types.make_type (RECORD_TYPE);
7603 type_decl = build_decl (BUILTINS_LOCATION,
7604 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7606 f_gpr = build_decl (BUILTINS_LOCATION,
7607 FIELD_DECL, get_identifier ("gp_offset"),
7608 unsigned_type_node);
7609 f_fpr = build_decl (BUILTINS_LOCATION,
7610 FIELD_DECL, get_identifier ("fp_offset"),
7611 unsigned_type_node);
7612 f_ovf = build_decl (BUILTINS_LOCATION,
7613 FIELD_DECL, get_identifier ("overflow_arg_area"),
7614 ptr_type_node);
7615 f_sav = build_decl (BUILTINS_LOCATION,
7616 FIELD_DECL, get_identifier ("reg_save_area"),
7617 ptr_type_node);
7619 va_list_gpr_counter_field = f_gpr;
7620 va_list_fpr_counter_field = f_fpr;
7622 DECL_FIELD_CONTEXT (f_gpr) = record;
7623 DECL_FIELD_CONTEXT (f_fpr) = record;
7624 DECL_FIELD_CONTEXT (f_ovf) = record;
7625 DECL_FIELD_CONTEXT (f_sav) = record;
7627 TYPE_STUB_DECL (record) = type_decl;
7628 TYPE_NAME (record) = type_decl;
7629 TYPE_FIELDS (record) = f_gpr;
7630 DECL_CHAIN (f_gpr) = f_fpr;
7631 DECL_CHAIN (f_fpr) = f_ovf;
7632 DECL_CHAIN (f_ovf) = f_sav;
7634 layout_type (record);
7636 /* The correct type is an array type of one element. */
7637 return build_array_type (record, build_index_type (size_zero_node));
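/* For orientation, the record laid out above corresponds roughly to the
   C-level declaration

       typedef struct __va_list_tag {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __builtin_va_list[1];

   Sketch only; the authoritative type is the tree built field by field
   above.  */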
7640 /* Set up the builtin va_list data type and, for 64-bit, the additional
7641 calling convention specific va_list data types. */
7643 static tree
7644 ix86_build_builtin_va_list (void)
7646 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7648 /* Initialize abi specific va_list builtin types. */
7649 if (TARGET_64BIT)
7651 tree t;
7652 if (ix86_abi == MS_ABI)
7654 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7655 if (TREE_CODE (t) != RECORD_TYPE)
7656 t = build_variant_type_copy (t);
7657 sysv_va_list_type_node = t;
7659 else
7661 t = ret;
7662 if (TREE_CODE (t) != RECORD_TYPE)
7663 t = build_variant_type_copy (t);
7664 sysv_va_list_type_node = t;
7666 if (ix86_abi != MS_ABI)
7668 t = ix86_build_builtin_va_list_abi (MS_ABI);
7669 if (TREE_CODE (t) != RECORD_TYPE)
7670 t = build_variant_type_copy (t);
7671 ms_va_list_type_node = t;
7673 else
7675 t = ret;
7676 if (TREE_CODE (t) != RECORD_TYPE)
7677 t = build_variant_type_copy (t);
7678 ms_va_list_type_node = t;
7682 return ret;
7685 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7687 static void
7688 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7690 rtx save_area, mem;
7691 alias_set_type set;
7692 int i, max;
7694 /* GPR size of varargs save area. */
7695 if (cfun->va_list_gpr_size)
7696 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7697 else
7698 ix86_varargs_gpr_size = 0;
7700 /* FPR size of varargs save area. We don't need it if we don't pass
7701 anything in SSE registers. */
7702 if (TARGET_SSE && cfun->va_list_fpr_size)
7703 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7704 else
7705 ix86_varargs_fpr_size = 0;
7707 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7708 return;
7710 save_area = frame_pointer_rtx;
7711 set = get_varargs_alias_set ();
7713 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7714 if (max > X86_64_REGPARM_MAX)
7715 max = X86_64_REGPARM_MAX;
7717 for (i = cum->regno; i < max; i++)
7719 mem = gen_rtx_MEM (word_mode,
7720 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7721 MEM_NOTRAP_P (mem) = 1;
7722 set_mem_alias_set (mem, set);
7723 emit_move_insn (mem,
7724 gen_rtx_REG (word_mode,
7725 x86_64_int_parameter_registers[i]));
7728 if (ix86_varargs_fpr_size)
7730 enum machine_mode smode;
7731 rtx label, test;
7733 /* Now emit code to save SSE registers. The AX parameter contains number
7734 of SSE parameter registers used to call this function, though all we
7735 actually check here is the zero/non-zero status. */
7737 label = gen_label_rtx ();
7738 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7739 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7740 label));
7742 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7743 we used movdqa (i.e. TImode) instead? Perhaps even better would
7744 be if we could determine the real mode of the data, via a hook
7745 into pass_stdarg. Ignore all that for now. */
7746 smode = V4SFmode;
7747 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7748 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7750 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7751 if (max > X86_64_SSE_REGPARM_MAX)
7752 max = X86_64_SSE_REGPARM_MAX;
7754 for (i = cum->sse_regno; i < max; ++i)
7756 mem = plus_constant (Pmode, save_area,
7757 i * 16 + ix86_varargs_gpr_size);
7758 mem = gen_rtx_MEM (smode, mem);
7759 MEM_NOTRAP_P (mem) = 1;
7760 set_mem_alias_set (mem, set);
7761 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7763 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7766 emit_label (label);
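/* Informal picture of the varargs save area set up above: the first
   X86_64_REGPARM_MAX words hold the integer argument registers, and the
   SSE registers (16 bytes each) follow at offset ix86_varargs_gpr_size.
   The %al == 0 branch skips the SSE stores entirely for calls that pass
   no floating-point arguments.  Sketch only.  */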
7770 static void
7771 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7773 alias_set_type set = get_varargs_alias_set ();
7774 int i;
7776 /* Reset to zero, as there might have been a sysv va_arg used
7777 before. */
7778 ix86_varargs_gpr_size = 0;
7779 ix86_varargs_fpr_size = 0;
7781 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7783 rtx reg, mem;
7785 mem = gen_rtx_MEM (Pmode,
7786 plus_constant (Pmode, virtual_incoming_args_rtx,
7787 i * UNITS_PER_WORD));
7788 MEM_NOTRAP_P (mem) = 1;
7789 set_mem_alias_set (mem, set);
7791 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7792 emit_move_insn (mem, reg);
7796 static void
7797 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7798 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7799 int no_rtl)
7801 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7802 CUMULATIVE_ARGS next_cum;
7803 tree fntype;
7805 /* This argument doesn't appear to be used anymore. Which is good,
7806 because the old code here didn't suppress rtl generation. */
7807 gcc_assert (!no_rtl);
7809 if (!TARGET_64BIT)
7810 return;
7812 fntype = TREE_TYPE (current_function_decl);
7814 /* For varargs, we do not want to skip the dummy va_dcl argument.
7815 For stdargs, we do want to skip the last named argument. */
7816 next_cum = *cum;
7817 if (stdarg_p (fntype))
7818 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7819 true);
7821 if (cum->call_abi == MS_ABI)
7822 setup_incoming_varargs_ms_64 (&next_cum);
7823 else
7824 setup_incoming_varargs_64 (&next_cum);
7827 /* Checks if TYPE is of kind va_list char *. */
7829 static bool
7830 is_va_list_char_pointer (tree type)
7832 tree canonic;
7834 /* For 32-bit it is always true. */
7835 if (!TARGET_64BIT)
7836 return true;
7837 canonic = ix86_canonical_va_list_type (type);
7838 return (canonic == ms_va_list_type_node
7839 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7842 /* Implement va_start. */
7844 static void
7845 ix86_va_start (tree valist, rtx nextarg)
7847 HOST_WIDE_INT words, n_gpr, n_fpr;
7848 tree f_gpr, f_fpr, f_ovf, f_sav;
7849 tree gpr, fpr, ovf, sav, t;
7850 tree type;
7851 rtx ovf_rtx;
7853 if (flag_split_stack
7854 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7856 unsigned int scratch_regno;
7858 /* When we are splitting the stack, we can't refer to the stack
7859 arguments using internal_arg_pointer, because they may be on
7860 the old stack. The split stack prologue will arrange to
7861 leave a pointer to the old stack arguments in a scratch
7862 register, which we here copy to a pseudo-register. The split
7863 stack prologue can't set the pseudo-register directly because
7864 it (the prologue) runs before any registers have been saved. */
7866 scratch_regno = split_stack_prologue_scratch_regno ();
7867 if (scratch_regno != INVALID_REGNUM)
7869 rtx reg, seq;
7871 reg = gen_reg_rtx (Pmode);
7872 cfun->machine->split_stack_varargs_pointer = reg;
7874 start_sequence ();
7875 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7876 seq = get_insns ();
7877 end_sequence ();
7879 push_topmost_sequence ();
7880 emit_insn_after (seq, entry_of_function ());
7881 pop_topmost_sequence ();
7885 /* Only the 64-bit target needs something special. */
7886 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7888 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7889 std_expand_builtin_va_start (valist, nextarg);
7890 else
7892 rtx va_r, next;
7894 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7895 next = expand_binop (ptr_mode, add_optab,
7896 cfun->machine->split_stack_varargs_pointer,
7897 crtl->args.arg_offset_rtx,
7898 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7899 convert_move (va_r, next, 0);
7901 return;
7904 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7905 f_fpr = DECL_CHAIN (f_gpr);
7906 f_ovf = DECL_CHAIN (f_fpr);
7907 f_sav = DECL_CHAIN (f_ovf);
7909 valist = build_simple_mem_ref (valist);
7910 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7911 /* The following should be folded into the MEM_REF offset. */
7912 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7913 f_gpr, NULL_TREE);
7914 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7915 f_fpr, NULL_TREE);
7916 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7917 f_ovf, NULL_TREE);
7918 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7919 f_sav, NULL_TREE);
7921 /* Count number of gp and fp argument registers used. */
7922 words = crtl->args.info.words;
7923 n_gpr = crtl->args.info.regno;
7924 n_fpr = crtl->args.info.sse_regno;
7926 if (cfun->va_list_gpr_size)
7928 type = TREE_TYPE (gpr);
7929 t = build2 (MODIFY_EXPR, type,
7930 gpr, build_int_cst (type, n_gpr * 8));
7931 TREE_SIDE_EFFECTS (t) = 1;
7932 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7935 if (TARGET_SSE && cfun->va_list_fpr_size)
7937 type = TREE_TYPE (fpr);
7938 t = build2 (MODIFY_EXPR, type, fpr,
7939 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7940 TREE_SIDE_EFFECTS (t) = 1;
7941 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7944 /* Find the overflow area. */
7945 type = TREE_TYPE (ovf);
7946 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7947 ovf_rtx = crtl->args.internal_arg_pointer;
7948 else
7949 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7950 t = make_tree (type, ovf_rtx);
7951 if (words != 0)
7952 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7953 t = build2 (MODIFY_EXPR, type, ovf, t);
7954 TREE_SIDE_EFFECTS (t) = 1;
7955 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7957 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7959 /* Find the register save area.
7960 The function prologue saves it right above the stack frame. */
7961 type = TREE_TYPE (sav);
7962 t = make_tree (type, frame_pointer_rtx);
7963 if (!ix86_varargs_gpr_size)
7964 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7965 t = build2 (MODIFY_EXPR, type, sav, t);
7966 TREE_SIDE_EFFECTS (t) = 1;
7967 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
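/* Illustrative example, not normative: for

       void f (int a, ...) { va_list ap; va_start (ap, a); ... }

   one integer register is consumed by A, so gp_offset starts at 8,
   fp_offset at 8 * X86_64_REGPARM_MAX, overflow_arg_area points at the
   incoming stack arguments, and reg_save_area at the block stored by
   setup_incoming_varargs_64 above.  */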
7971 /* Implement va_arg. */
7973 static tree
7974 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7975 gimple_seq *post_p)
7977 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7978 tree f_gpr, f_fpr, f_ovf, f_sav;
7979 tree gpr, fpr, ovf, sav, t;
7980 int size, rsize;
7981 tree lab_false, lab_over = NULL_TREE;
7982 tree addr, t2;
7983 rtx container;
7984 int indirect_p = 0;
7985 tree ptrtype;
7986 enum machine_mode nat_mode;
7987 unsigned int arg_boundary;
7989 /* Only the 64-bit target needs something special. */
7990 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7991 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7993 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7994 f_fpr = DECL_CHAIN (f_gpr);
7995 f_ovf = DECL_CHAIN (f_fpr);
7996 f_sav = DECL_CHAIN (f_ovf);
7998 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7999 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8000 valist = build_va_arg_indirect_ref (valist);
8001 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8002 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8003 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8005 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8006 if (indirect_p)
8007 type = build_pointer_type (type);
8008 size = int_size_in_bytes (type);
8009 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8011 nat_mode = type_natural_mode (type, NULL);
8012 switch (nat_mode)
8014 case V8SFmode:
8015 case V8SImode:
8016 case V32QImode:
8017 case V16HImode:
8018 case V4DFmode:
8019 case V4DImode:
8020 /* Unnamed 256bit vector mode parameters are passed on stack. */
8021 if (!TARGET_64BIT_MS_ABI)
8023 container = NULL;
8024 break;
8027 default:
8028 container = construct_container (nat_mode, TYPE_MODE (type),
8029 type, 0, X86_64_REGPARM_MAX,
8030 X86_64_SSE_REGPARM_MAX, intreg,
8032 break;
8035 /* Pull the value out of the saved registers. */
8037 addr = create_tmp_var (ptr_type_node, "addr");
8039 if (container)
8041 int needed_intregs, needed_sseregs;
8042 bool need_temp;
8043 tree int_addr, sse_addr;
8045 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8046 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8048 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8050 need_temp = (!REG_P (container)
8051 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8052 || TYPE_ALIGN (type) > 128));
8054 /* In case we are passing a structure, verify that it is a consecutive block
8055 on the register save area. If not, we need to do moves. */
8056 if (!need_temp && !REG_P (container))
8058 /* Verify that all registers are strictly consecutive. */
8059 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8061 int i;
8063 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8065 rtx slot = XVECEXP (container, 0, i);
8066 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8067 || INTVAL (XEXP (slot, 1)) != i * 16)
8068 need_temp = 1;
8071 else
8073 int i;
8075 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8077 rtx slot = XVECEXP (container, 0, i);
8078 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8079 || INTVAL (XEXP (slot, 1)) != i * 8)
8080 need_temp = 1;
8084 if (!need_temp)
8086 int_addr = addr;
8087 sse_addr = addr;
8089 else
8091 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8092 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8095 /* First ensure that we fit completely in registers. */
8096 if (needed_intregs)
8098 t = build_int_cst (TREE_TYPE (gpr),
8099 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8100 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8101 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8102 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8103 gimplify_and_add (t, pre_p);
8105 if (needed_sseregs)
8107 t = build_int_cst (TREE_TYPE (fpr),
8108 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8109 + X86_64_REGPARM_MAX * 8);
8110 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8111 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8112 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8113 gimplify_and_add (t, pre_p);
8116 /* Compute index to start of area used for integer regs. */
8117 if (needed_intregs)
8119 /* int_addr = gpr + sav; */
8120 t = fold_build_pointer_plus (sav, gpr);
8121 gimplify_assign (int_addr, t, pre_p);
8123 if (needed_sseregs)
8125 /* sse_addr = fpr + sav; */
8126 t = fold_build_pointer_plus (sav, fpr);
8127 gimplify_assign (sse_addr, t, pre_p);
8129 if (need_temp)
8131 int i, prev_size = 0;
8132 tree temp = create_tmp_var (type, "va_arg_tmp");
8134 /* addr = &temp; */
8135 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8136 gimplify_assign (addr, t, pre_p);
8138 for (i = 0; i < XVECLEN (container, 0); i++)
8140 rtx slot = XVECEXP (container, 0, i);
8141 rtx reg = XEXP (slot, 0);
8142 enum machine_mode mode = GET_MODE (reg);
8143 tree piece_type;
8144 tree addr_type;
8145 tree daddr_type;
8146 tree src_addr, src;
8147 int src_offset;
8148 tree dest_addr, dest;
8149 int cur_size = GET_MODE_SIZE (mode);
8151 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8152 prev_size = INTVAL (XEXP (slot, 1));
8153 if (prev_size + cur_size > size)
8155 cur_size = size - prev_size;
8156 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8157 if (mode == BLKmode)
8158 mode = QImode;
8160 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8161 if (mode == GET_MODE (reg))
8162 addr_type = build_pointer_type (piece_type);
8163 else
8164 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8165 true);
8166 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8167 true);
8169 if (SSE_REGNO_P (REGNO (reg)))
8171 src_addr = sse_addr;
8172 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8174 else
8176 src_addr = int_addr;
8177 src_offset = REGNO (reg) * 8;
8179 src_addr = fold_convert (addr_type, src_addr);
8180 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8182 dest_addr = fold_convert (daddr_type, addr);
8183 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8184 if (cur_size == GET_MODE_SIZE (mode))
8186 src = build_va_arg_indirect_ref (src_addr);
8187 dest = build_va_arg_indirect_ref (dest_addr);
8189 gimplify_assign (dest, src, pre_p);
8191 else
8193 tree copy
8194 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8195 3, dest_addr, src_addr,
8196 size_int (cur_size));
8197 gimplify_and_add (copy, pre_p);
8199 prev_size += cur_size;
8203 if (needed_intregs)
8205 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8206 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8207 gimplify_assign (gpr, t, pre_p);
8210 if (needed_sseregs)
8212 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8213 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8214 gimplify_assign (fpr, t, pre_p);
8217 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8219 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8222 /* ... otherwise out of the overflow area. */
8224 /* When we align a parameter on the stack for the caller, if its
8225 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8226 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
8227 here with the caller. */
8228 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8229 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8230 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8232 /* Care for on-stack alignment if needed. */
8233 if (arg_boundary <= 64 || size == 0)
8234 t = ovf;
8235 else
8237 HOST_WIDE_INT align = arg_boundary / 8;
8238 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8239 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8240 build_int_cst (TREE_TYPE (t), -align));
8243 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8244 gimplify_assign (addr, t, pre_p);
8246 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8247 gimplify_assign (unshare_expr (ovf), t, pre_p);
8249 if (container)
8250 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8252 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8253 addr = fold_convert (ptrtype, addr);
8255 if (indirect_p)
8256 addr = build_va_arg_indirect_ref (addr);
8257 return build_va_arg_indirect_ref (addr);
8260 /* Return true if OPNUM's MEM should be matched
8261 in movabs* patterns. */
8263 bool
8264 ix86_check_movabs (rtx insn, int opnum)
8266 rtx set, mem;
8268 set = PATTERN (insn);
8269 if (GET_CODE (set) == PARALLEL)
8270 set = XVECEXP (set, 0, 0);
8271 gcc_assert (GET_CODE (set) == SET);
8272 mem = XEXP (set, opnum);
8273 while (GET_CODE (mem) == SUBREG)
8274 mem = SUBREG_REG (mem);
8275 gcc_assert (MEM_P (mem));
8276 return volatile_ok || !MEM_VOLATILE_P (mem);
8279 /* Initialize the table of extra 80387 mathematical constants. */
8281 static void
8282 init_ext_80387_constants (void)
8284 static const char * cst[5] =
8286 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8287 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8288 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8289 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8290 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8292 int i;
8294 for (i = 0; i < 5; i++)
8296 real_from_string (&ext_80387_constants_table[i], cst[i]);
8297 /* Ensure each constant is rounded to XFmode precision. */
8298 real_convert (&ext_80387_constants_table[i],
8299 XFmode, &ext_80387_constants_table[i]);
8302 ext_80387_constants_init = 1;
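/* For reference: the strings above are decimal expansions of log10(2),
   ln(2), log2(e), log2(10) and pi, i.e. the values loaded by the
   fldlg2, fldln2, fldl2e, fldl2t and fldpi instructions they are paired
   with.  */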
8305 /* Return non-zero if the constant is something that
8306 can be loaded with a special instruction. */
8309 standard_80387_constant_p (rtx x)
8311 enum machine_mode mode = GET_MODE (x);
8313 REAL_VALUE_TYPE r;
8315 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8316 return -1;
8318 if (x == CONST0_RTX (mode))
8319 return 1;
8320 if (x == CONST1_RTX (mode))
8321 return 2;
8323 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8325 /* For XFmode constants, try to find a special 80387 instruction when
8326 optimizing for size or on those CPUs that benefit from them. */
8327 if (mode == XFmode
8328 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8330 int i;
8332 if (! ext_80387_constants_init)
8333 init_ext_80387_constants ();
8335 for (i = 0; i < 5; i++)
8336 if (real_identical (&r, &ext_80387_constants_table[i]))
8337 return i + 3;
8340 /* Load of the constant -0.0 or -1.0 will be split as
8341 fldz;fchs or fld1;fchs sequence. */
8342 if (real_isnegzero (&r))
8343 return 8;
8344 if (real_identical (&r, &dconstm1))
8345 return 9;
8347 return 0;
8350 /* Return the opcode of the special instruction to be used to load
8351 the constant X. */
8353 const char *
8354 standard_80387_constant_opcode (rtx x)
8356 switch (standard_80387_constant_p (x))
8358 case 1:
8359 return "fldz";
8360 case 2:
8361 return "fld1";
8362 case 3:
8363 return "fldlg2";
8364 case 4:
8365 return "fldln2";
8366 case 5:
8367 return "fldl2e";
8368 case 6:
8369 return "fldl2t";
8370 case 7:
8371 return "fldpi";
8372 case 8:
8373 case 9:
8374 return "#";
8375 default:
8376 gcc_unreachable ();
8380 /* Return the CONST_DOUBLE representing the 80387 constant that is
8381 loaded by the specified special instruction. The argument IDX
8382 matches the return value from standard_80387_constant_p. */
8385 standard_80387_constant_rtx (int idx)
8387 int i;
8389 if (! ext_80387_constants_init)
8390 init_ext_80387_constants ();
8392 switch (idx)
8394 case 3:
8395 case 4:
8396 case 5:
8397 case 6:
8398 case 7:
8399 i = idx - 3;
8400 break;
8402 default:
8403 gcc_unreachable ();
8406 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8407 XFmode);
8410 /* Return 1 if X is all 0s and 2 if X is all 1s
8411 in supported SSE/AVX vector mode. */
8414 standard_sse_constant_p (rtx x)
8416 enum machine_mode mode = GET_MODE (x);
8418 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8419 return 1;
8420 if (vector_all_ones_operand (x, mode))
8421 switch (mode)
8423 case V16QImode:
8424 case V8HImode:
8425 case V4SImode:
8426 case V2DImode:
8427 if (TARGET_SSE2)
8428 return 2;
8429 case V32QImode:
8430 case V16HImode:
8431 case V8SImode:
8432 case V4DImode:
8433 if (TARGET_AVX2)
8434 return 2;
8435 default:
8436 break;
8439 return 0;
8442 /* Return the opcode of the special instruction to be used to load
8443 the constant X. */
8445 const char *
8446 standard_sse_constant_opcode (rtx insn, rtx x)
8448 switch (standard_sse_constant_p (x))
8450 case 1:
8451 switch (get_attr_mode (insn))
8453 case MODE_TI:
8454 return "%vpxor\t%0, %d0";
8455 case MODE_V2DF:
8456 return "%vxorpd\t%0, %d0";
8457 case MODE_V4SF:
8458 return "%vxorps\t%0, %d0";
8460 case MODE_OI:
8461 return "vpxor\t%x0, %x0, %x0";
8462 case MODE_V4DF:
8463 return "vxorpd\t%x0, %x0, %x0";
8464 case MODE_V8SF:
8465 return "vxorps\t%x0, %x0, %x0";
8467 default:
8468 break;
8471 case 2:
8472 if (TARGET_AVX)
8473 return "vpcmpeqd\t%0, %0, %0";
8474 else
8475 return "pcmpeqd\t%0, %0";
8477 default:
8478 break;
8480 gcc_unreachable ();
8483 /* Returns true if OP contains a symbol reference. */
8485 bool
8486 symbolic_reference_mentioned_p (rtx op)
8488 const char *fmt;
8489 int i;
8491 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8492 return true;
8494 fmt = GET_RTX_FORMAT (GET_CODE (op));
8495 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8497 if (fmt[i] == 'E')
8499 int j;
8501 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8502 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8503 return true;
8506 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8507 return true;
8510 return false;
8513 /* Return true if it is appropriate to emit `ret' instructions in the
8514 body of a function. Do this only if the epilogue is simple, needing a
8515 couple of insns. Prior to reloading, we can't tell how many registers
8516 must be saved, so return false then. Return false if there is no frame
8517 marker to de-allocate. */
8519 bool
8520 ix86_can_use_return_insn_p (void)
8522 struct ix86_frame frame;
8524 if (! reload_completed || frame_pointer_needed)
8525 return 0;
8527 /* Don't allow more than 32k pop, since that's all we can do
8528 with one instruction. */
8529 if (crtl->args.pops_args && crtl->args.size >= 32768)
8530 return 0;
8532 ix86_compute_frame_layout (&frame);
8533 return (frame.stack_pointer_offset == UNITS_PER_WORD
8534 && (frame.nregs + frame.nsseregs) == 0);
8537 /* Value should be nonzero if functions must have frame pointers.
8538 Zero means the frame pointer need not be set up (and parms may
8539 be accessed via the stack pointer) in functions that seem suitable. */
8541 static bool
8542 ix86_frame_pointer_required (void)
8544 /* If we accessed previous frames, then the generated code expects
8545 to be able to access the saved ebp value in our frame. */
8546 if (cfun->machine->accesses_prev_frame)
8547 return true;
8549 /* Several x86 OSes need a frame pointer for other reasons,
8550 usually pertaining to setjmp. */
8551 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8552 return true;
8554 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8555 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8556 return true;
8558 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
8559 stack allocation is 4GB. */
8560 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8561 return true;
8563 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8564 turns off the frame pointer by default. Turn it back on now if
8565 we've not got a leaf function. */
8566 if (TARGET_OMIT_LEAF_FRAME_POINTER
8567 && (!crtl->is_leaf
8568 || ix86_current_function_calls_tls_descriptor))
8569 return true;
8571 if (crtl->profile && !flag_fentry)
8572 return true;
8574 return false;
8577 /* Record that the current function accesses previous call frames. */
8579 void
8580 ix86_setup_frame_addresses (void)
8582 cfun->machine->accesses_prev_frame = 1;
8585 #ifndef USE_HIDDEN_LINKONCE
8586 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8587 # define USE_HIDDEN_LINKONCE 1
8588 # else
8589 # define USE_HIDDEN_LINKONCE 0
8590 # endif
8591 #endif
8593 static int pic_labels_used;
8595 /* Fills in the label name that should be used for a pc thunk for
8596 the given register. */
8598 static void
8599 get_pc_thunk_name (char name[32], unsigned int regno)
8601 gcc_assert (!TARGET_64BIT);
8603 if (USE_HIDDEN_LINKONCE)
8604 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8605 else
8606 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
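/* Example (informal): with USE_HIDDEN_LINKONCE the thunk for %ebx is
   named "__x86.get_pc_thunk.bx"; otherwise an internal label of the
   form "LPR" followed by the register number is generated instead.  */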
8610 /* This function generates code for -fpic that loads %ebx with
8611 the return address of the caller and then returns. */
8613 static void
8614 ix86_code_end (void)
8616 rtx xops[2];
8617 int regno;
8619 for (regno = AX_REG; regno <= SP_REG; regno++)
8621 char name[32];
8622 tree decl;
8624 if (!(pic_labels_used & (1 << regno)))
8625 continue;
8627 get_pc_thunk_name (name, regno);
8629 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8630 get_identifier (name),
8631 build_function_type_list (void_type_node, NULL_TREE));
8632 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8633 NULL_TREE, void_type_node);
8634 TREE_PUBLIC (decl) = 1;
8635 TREE_STATIC (decl) = 1;
8636 DECL_IGNORED_P (decl) = 1;
8638 #if TARGET_MACHO
8639 if (TARGET_MACHO)
8641 switch_to_section (darwin_sections[text_coal_section]);
8642 fputs ("\t.weak_definition\t", asm_out_file);
8643 assemble_name (asm_out_file, name);
8644 fputs ("\n\t.private_extern\t", asm_out_file);
8645 assemble_name (asm_out_file, name);
8646 putc ('\n', asm_out_file);
8647 ASM_OUTPUT_LABEL (asm_out_file, name);
8648 DECL_WEAK (decl) = 1;
8650 else
8651 #endif
8652 if (USE_HIDDEN_LINKONCE)
8654 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8656 targetm.asm_out.unique_section (decl, 0);
8657 switch_to_section (get_named_section (decl, NULL, 0));
8659 targetm.asm_out.globalize_label (asm_out_file, name);
8660 fputs ("\t.hidden\t", asm_out_file);
8661 assemble_name (asm_out_file, name);
8662 putc ('\n', asm_out_file);
8663 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8665 else
8667 switch_to_section (text_section);
8668 ASM_OUTPUT_LABEL (asm_out_file, name);
8671 DECL_INITIAL (decl) = make_node (BLOCK);
8672 current_function_decl = decl;
8673 init_function_start (decl);
8674 first_function_block_is_cold = false;
8675 /* Make sure unwind info is emitted for the thunk if needed. */
8676 final_start_function (emit_barrier (), asm_out_file, 1);
8678 /* Pad stack IP move with 4 instructions (two NOPs count
8679 as one instruction). */
8680 if (TARGET_PAD_SHORT_FUNCTION)
8682 int i = 8;
8684 while (i--)
8685 fputs ("\tnop\n", asm_out_file);
8688 xops[0] = gen_rtx_REG (Pmode, regno);
8689 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8690 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8691 fputs ("\tret\n", asm_out_file);
8692 final_end_function ();
8693 init_insn_lengths ();
8694 free_after_compilation (cfun);
8695 set_cfun (NULL);
8696 current_function_decl = NULL;
8699 if (flag_split_stack)
8700 file_end_indicate_split_stack ();
8703 /* Emit code for the SET_GOT patterns. */
8705 const char *
8706 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8708 rtx xops[3];
8710 xops[0] = dest;
8712 if (TARGET_VXWORKS_RTP && flag_pic)
8714 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8715 xops[2] = gen_rtx_MEM (Pmode,
8716 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8717 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8719 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8720 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8721 an unadorned address. */
8722 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8723 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8724 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8725 return "";
8728 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8730 if (!flag_pic)
8732 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8734 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8736 #if TARGET_MACHO
8737 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8738 is what will be referenced by the Mach-O PIC subsystem. */
8739 if (!label)
8740 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8741 #endif
8743 targetm.asm_out.internal_label (asm_out_file, "L",
8744 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8746 else
8748 char name[32];
8749 get_pc_thunk_name (name, REGNO (dest));
8750 pic_labels_used |= 1 << REGNO (dest);
8752 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8753 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8754 output_asm_insn ("call\t%X2", xops);
8755 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8756 is what will be referenced by the Mach-O PIC subsystem. */
8757 #if TARGET_MACHO
8758 if (!label)
8759 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8760 else
8761 targetm.asm_out.internal_label (asm_out_file, "L",
8762 CODE_LABEL_NUMBER (label));
8763 #endif
8766 if (!TARGET_MACHO)
8767 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8769 return "";
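/* Informal example of the non-Mach-O PIC sequence produced above when
   DEST is %ebx, assuming GOT_SYMBOL_NAME expands to
   _GLOBAL_OFFSET_TABLE_ (sketch only):

       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx
*/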
8772 /* Generate a "push" pattern for input ARG. */
8774 static rtx
8775 gen_push (rtx arg)
8777 struct machine_function *m = cfun->machine;
8779 if (m->fs.cfa_reg == stack_pointer_rtx)
8780 m->fs.cfa_offset += UNITS_PER_WORD;
8781 m->fs.sp_offset += UNITS_PER_WORD;
8783 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8784 arg = gen_rtx_REG (word_mode, REGNO (arg));
8786 return gen_rtx_SET (VOIDmode,
8787 gen_rtx_MEM (word_mode,
8788 gen_rtx_PRE_DEC (Pmode,
8789 stack_pointer_rtx)),
8790 arg);
8793 /* Generate a "pop" pattern for input ARG. */
8795 static rtx
8796 gen_pop (rtx arg)
8798 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8799 arg = gen_rtx_REG (word_mode, REGNO (arg));
8801 return gen_rtx_SET (VOIDmode,
8802 arg,
8803 gen_rtx_MEM (word_mode,
8804 gen_rtx_POST_INC (Pmode,
8805 stack_pointer_rtx)));
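/* Informal sketch of the RTL produced by the two helpers above, with
   word_mode shown as DImode for a 64-bit target:

       gen_push (arg) -> (set (mem:DI (pre_dec (reg sp))) arg)
       gen_pop  (arg) -> (set arg (mem:DI (post_inc (reg sp))))

   gen_push additionally keeps the frame-state bookkeeping (cfa_offset,
   sp_offset) in sync.  */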
8808 /* Return the number of an unused call-clobbered register available for the
8809 entire function, or INVALID_REGNUM if there is none. */
8811 static unsigned int
8812 ix86_select_alt_pic_regnum (void)
8814 if (crtl->is_leaf
8815 && !crtl->profile
8816 && !ix86_current_function_calls_tls_descriptor)
8818 int i, drap;
8819 /* Can't use the same register for both PIC and DRAP. */
8820 if (crtl->drap_reg)
8821 drap = REGNO (crtl->drap_reg);
8822 else
8823 drap = -1;
8824 for (i = 2; i >= 0; --i)
8825 if (i != drap && !df_regs_ever_live_p (i))
8826 return i;
8829 return INVALID_REGNUM;
8832 /* Return TRUE if we need to save REGNO. */
8834 static bool
8835 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8837 if (pic_offset_table_rtx
8838 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8839 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8840 || crtl->profile
8841 || crtl->calls_eh_return
8842 || crtl->uses_const_pool))
8843 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8845 if (crtl->calls_eh_return && maybe_eh_return)
8847 unsigned i;
8848 for (i = 0; ; i++)
8850 unsigned test = EH_RETURN_DATA_REGNO (i);
8851 if (test == INVALID_REGNUM)
8852 break;
8853 if (test == regno)
8854 return true;
8858 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8859 return true;
8861 return (df_regs_ever_live_p (regno)
8862 && !call_used_regs[regno]
8863 && !fixed_regs[regno]
8864 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8867 /* Return the number of saved general purpose registers. */
8869 static int
8870 ix86_nsaved_regs (void)
8872 int nregs = 0;
8873 int regno;
8875 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8876 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8877 nregs ++;
8878 return nregs;
8881 /* Return the number of saved SSE registers. */
8883 static int
8884 ix86_nsaved_sseregs (void)
8886 int nregs = 0;
8887 int regno;
8889 if (!TARGET_64BIT_MS_ABI)
8890 return 0;
8891 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8892 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8893 nregs ++;
8894 return nregs;
8897 /* Given FROM and TO register numbers, say whether this elimination is
8898 allowed. If stack alignment is needed, we can only replace argument
8899 pointer with hard frame pointer, or replace frame pointer with stack
8900 pointer. Otherwise, frame pointer elimination is automatically
8901 handled and all other eliminations are valid. */
8903 static bool
8904 ix86_can_eliminate (const int from, const int to)
8906 if (stack_realign_fp)
8907 return ((from == ARG_POINTER_REGNUM
8908 && to == HARD_FRAME_POINTER_REGNUM)
8909 || (from == FRAME_POINTER_REGNUM
8910 && to == STACK_POINTER_REGNUM));
8911 else
8912 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8915 /* Return the offset between two registers, one to be eliminated, and the other
8916 its replacement, at the start of a routine. */
8918 HOST_WIDE_INT
8919 ix86_initial_elimination_offset (int from, int to)
8921 struct ix86_frame frame;
8922 ix86_compute_frame_layout (&frame);
8924 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8925 return frame.hard_frame_pointer_offset;
8926 else if (from == FRAME_POINTER_REGNUM
8927 && to == HARD_FRAME_POINTER_REGNUM)
8928 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8929 else
8931 gcc_assert (to == STACK_POINTER_REGNUM);
8933 if (from == ARG_POINTER_REGNUM)
8934 return frame.stack_pointer_offset;
8936 gcc_assert (from == FRAME_POINTER_REGNUM);
8937 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8941 /* In a dynamically-aligned function, we can't know the offset from
8942 stack pointer to frame pointer, so we must ensure that setjmp
8943 eliminates fp against the hard fp (%ebp) rather than trying to
8944 index from %esp up to the top of the frame across a gap that is
8945 of unknown (at compile-time) size. */
8946 static rtx
8947 ix86_builtin_setjmp_frame_value (void)
8949 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8952 /* When using -fsplit-stack, the allocation routines set a field in
8953 the TCB to the bottom of the stack plus this much space, measured
8954 in bytes. */
8956 #define SPLIT_STACK_AVAILABLE 256
8958 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
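/* A rough sketch of the layout this function computes; every *_offset field
   records the distance from the CFA down to the bottom of the corresponding
   area (the exact picture varies with SEH, stack realignment and the red
   zone):

       return address
       pushed static chain (if any)
       saved frame pointer                       <- hard_frame_pointer_offset
       GP register save area                     <- reg_save_offset
       SSE register save area (16-byte aligned)  <- sse_reg_save_offset
       va_arg register save area
       local variables                           <- frame_pointer_offset
       outgoing argument area
                                                 <- stack_pointer_offset  */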
8960 static void
8961 ix86_compute_frame_layout (struct ix86_frame *frame)
8963 unsigned HOST_WIDE_INT stack_alignment_needed;
8964 HOST_WIDE_INT offset;
8965 unsigned HOST_WIDE_INT preferred_alignment;
8966 HOST_WIDE_INT size = get_frame_size ();
8967 HOST_WIDE_INT to_allocate;
8969 frame->nregs = ix86_nsaved_regs ();
8970 frame->nsseregs = ix86_nsaved_sseregs ();
8972 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8973 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8975 /* The 64-bit MS ABI seems to require stack alignment to always be 16 except in
8976 function prologues and leaf functions. */
8977 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8978 && (!crtl->is_leaf || cfun->calls_alloca != 0
8979 || ix86_current_function_calls_tls_descriptor))
8981 preferred_alignment = 16;
8982 stack_alignment_needed = 16;
8983 crtl->preferred_stack_boundary = 128;
8984 crtl->stack_alignment_needed = 128;
8987 gcc_assert (!size || stack_alignment_needed);
8988 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8989 gcc_assert (preferred_alignment <= stack_alignment_needed);
8991 /* For SEH we have to limit the amount of code movement into the prologue.
8992 At present we do this via a BLOCKAGE, at which point there's very little
8993 scheduling that can be done, which means that there's very little point
8994 in doing anything except PUSHs. */
8995 if (TARGET_SEH)
8996 cfun->machine->use_fast_prologue_epilogue = false;
8998 /* During reload iterations the number of registers saved can change.
8999 Recompute the value as needed. Do not recompute when the number of registers
9000 didn't change, as reload calls the function multiple times and does not
9001 expect the decision to change within a single iteration. */
9002 else if (!optimize_function_for_size_p (cfun)
9003 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9005 int count = frame->nregs;
9006 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9008 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9010 /* The fast prologue uses move instead of push to save registers. This
9011 is significantly longer, but also executes faster as modern hardware
9012 can execute the moves in parallel, but can't do that for push/pop.
9014 Be careful about choosing which prologue to emit: when the function takes
9015 many instructions to execute we may use the slow version, as well as
9016 when the function is known to be outside a hot spot (this is known with
9017 feedback only). Weight the size of the function by the number of registers
9018 to save, as it is cheap to use one or two push instructions but very
9019 slow to use many of them. */
9020 if (count)
9021 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9022 if (node->frequency < NODE_FREQUENCY_NORMAL
9023 || (flag_branch_probabilities
9024 && node->frequency < NODE_FREQUENCY_HOT))
9025 cfun->machine->use_fast_prologue_epilogue = false;
9026 else
9027 cfun->machine->use_fast_prologue_epilogue
9028 = !expensive_function_p (count);
9031 frame->save_regs_using_mov
9032 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9033 /* If static stack checking is enabled and done with probes,
9034 the registers need to be saved before allocating the frame. */
9035 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9037 /* Skip return address. */
9038 offset = UNITS_PER_WORD;
9040 /* Skip pushed static chain. */
9041 if (ix86_static_chain_on_stack)
9042 offset += UNITS_PER_WORD;
9044 /* Skip saved base pointer. */
9045 if (frame_pointer_needed)
9046 offset += UNITS_PER_WORD;
9047 frame->hfp_save_offset = offset;
9049 /* The traditional frame pointer location is at the top of the frame. */
9050 frame->hard_frame_pointer_offset = offset;
9052 /* Register save area */
9053 offset += frame->nregs * UNITS_PER_WORD;
9054 frame->reg_save_offset = offset;
9056 /* On SEH target, registers are pushed just before the frame pointer
9057 location. */
9058 if (TARGET_SEH)
9059 frame->hard_frame_pointer_offset = offset;
9061 /* Align and set SSE register save area. */
9062 if (frame->nsseregs)
9064 /* The only ABI that has saved SSE registers (Win64) also has a
9065 16-byte aligned default stack, and thus we don't need to be
9066 within the re-aligned local stack frame to save them. */
9067 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9068 offset = (offset + 16 - 1) & -16;
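/* (X + 16 - 1) & -16 rounds X up to the next multiple of 16; for example
   an offset of 40 becomes 48, while 48 is left unchanged.  */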
9069 offset += frame->nsseregs * 16;
9071 frame->sse_reg_save_offset = offset;
9073 /* The re-aligned stack starts here. Values before this point are not
9074 directly comparable with values below this point. In order to make
9075 sure that no value happens to be the same before and after, force
9076 the alignment computation below to add a non-zero value. */
9077 if (stack_realign_fp)
9078 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9080 /* Va-arg area */
9081 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9082 offset += frame->va_arg_size;
9084 /* Align start of frame for local function. */
9085 if (stack_realign_fp
9086 || offset != frame->sse_reg_save_offset
9087 || size != 0
9088 || !crtl->is_leaf
9089 || cfun->calls_alloca
9090 || ix86_current_function_calls_tls_descriptor)
9091 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9093 /* Frame pointer points here. */
9094 frame->frame_pointer_offset = offset;
9096 offset += size;
9098 /* Add the outgoing arguments area. It can be skipped if we eliminated
9099 all the function calls as dead code.
9100 Skipping is however impossible when the function calls alloca; the alloca
9101 expander assumes that the last crtl->outgoing_args_size bytes
9102 of the stack frame are unused. */
9103 if (ACCUMULATE_OUTGOING_ARGS
9104 && (!crtl->is_leaf || cfun->calls_alloca
9105 || ix86_current_function_calls_tls_descriptor))
9107 offset += crtl->outgoing_args_size;
9108 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9110 else
9111 frame->outgoing_arguments_size = 0;
9113 /* Align stack boundary. Only needed if we're calling another function
9114 or using alloca. */
9115 if (!crtl->is_leaf || cfun->calls_alloca
9116 || ix86_current_function_calls_tls_descriptor)
9117 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9119 /* We've reached end of stack frame. */
9120 frame->stack_pointer_offset = offset;
9122 /* Size prologue needs to allocate. */
9123 to_allocate = offset - frame->sse_reg_save_offset;
9125 if ((!to_allocate && frame->nregs <= 1)
9126 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9127 frame->save_regs_using_mov = false;
9129 if (ix86_using_red_zone ()
9130 && crtl->sp_is_unchanging
9131 && crtl->is_leaf
9132 && !ix86_current_function_calls_tls_descriptor)
9134 frame->red_zone_size = to_allocate;
9135 if (frame->save_regs_using_mov)
9136 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9137 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9138 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9140 else
9141 frame->red_zone_size = 0;
9142 frame->stack_pointer_offset -= frame->red_zone_size;
9144 /* The SEH frame pointer location is near the bottom of the frame.
9145 This is enforced by the fact that the difference between the
9146 stack pointer and the frame pointer is limited to 240 bytes in
9147 the unwind data structure. */
9148 if (TARGET_SEH)
9150 HOST_WIDE_INT diff;
9152 /* If we can leave the frame pointer where it is, do so. This also
9153 returns the establisher frame for __builtin_frame_address (0). */
9154 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9155 if (diff <= SEH_MAX_FRAME_SIZE
9156 && (diff > 240 || (diff & 15) != 0)
9157 && !crtl->accesses_prior_frames)
9159 /* Ideally we'd determine what portion of the local stack frame
9160 (within the constraint of the lowest 240) is most heavily used.
9161 But without that complication, simply bias the frame pointer
9162 by 128 bytes so as to maximize the amount of the local stack
9163 frame that is addressable with 8-bit offsets. */
9164 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9169 /* This is semi-inlined memory_address_length, but simplified
9170 since we know that we're always dealing with reg+offset, and
9171 to avoid having to create and discard all that rtl. */
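/* For illustration, the lengths computed below are: 0 for a zero offset,
   except with %ebp or %r13 as base, which need a zero disp8 and count 1;
   1 for offsets in [-128, 127] (disp8); 4 otherwise (disp32); plus one more
   byte when the base is %esp or %r12, which require a SIB byte.  */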
9173 static inline int
9174 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9176 int len = 4;
9178 if (offset == 0)
9180 /* EBP and R13 cannot be encoded without an offset. */
9181 len = (regno == BP_REG || regno == R13_REG);
9183 else if (IN_RANGE (offset, -128, 127))
9184 len = 1;
9186 /* ESP and R12 must be encoded with a SIB byte. */
9187 if (regno == SP_REG || regno == R12_REG)
9188 len++;
9190 return len;
9193 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9194 The valid base registers are taken from CFUN->MACHINE->FS. */
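/* For example (illustrative numbers): in a fast prologue/epilogue with the
   frame pointer valid and fs.fp_offset == 16, a request for CFA_OFFSET == 24
   yields hard_frame_pointer_rtx plus (16 - 24), i.e. 8 bytes below %ebp.  */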
9196 static rtx
9197 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9199 const struct machine_function *m = cfun->machine;
9200 rtx base_reg = NULL;
9201 HOST_WIDE_INT base_offset = 0;
9203 if (m->use_fast_prologue_epilogue)
9205 /* Choose the base register most likely to allow the most scheduling
9206 opportunities. Generally FP is valid throughout the function,
9207 while DRAP must be reloaded within the epilogue. But choose either
9208 over the SP due to increased encoding size. */
9210 if (m->fs.fp_valid)
9212 base_reg = hard_frame_pointer_rtx;
9213 base_offset = m->fs.fp_offset - cfa_offset;
9215 else if (m->fs.drap_valid)
9217 base_reg = crtl->drap_reg;
9218 base_offset = 0 - cfa_offset;
9220 else if (m->fs.sp_valid)
9222 base_reg = stack_pointer_rtx;
9223 base_offset = m->fs.sp_offset - cfa_offset;
9226 else
9228 HOST_WIDE_INT toffset;
9229 int len = 16, tlen;
9231 /* Choose the base register with the smallest address encoding.
9232 With a tie, choose FP > DRAP > SP. */
9233 if (m->fs.sp_valid)
9235 base_reg = stack_pointer_rtx;
9236 base_offset = m->fs.sp_offset - cfa_offset;
9237 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9239 if (m->fs.drap_valid)
9241 toffset = 0 - cfa_offset;
9242 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9243 if (tlen <= len)
9245 base_reg = crtl->drap_reg;
9246 base_offset = toffset;
9247 len = tlen;
9250 if (m->fs.fp_valid)
9252 toffset = m->fs.fp_offset - cfa_offset;
9253 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9254 if (tlen <= len)
9256 base_reg = hard_frame_pointer_rtx;
9257 base_offset = toffset;
9258 len = tlen;
9262 gcc_assert (base_reg != NULL);
9264 return plus_constant (Pmode, base_reg, base_offset);
9267 /* Emit code to save registers in the prologue. */
9269 static void
9270 ix86_emit_save_regs (void)
9272 unsigned int regno;
9273 rtx insn;
9275 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9276 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9278 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9279 RTX_FRAME_RELATED_P (insn) = 1;
9283 /* Emit a single register save at CFA - CFA_OFFSET. */
9285 static void
9286 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9287 HOST_WIDE_INT cfa_offset)
9289 struct machine_function *m = cfun->machine;
9290 rtx reg = gen_rtx_REG (mode, regno);
9291 rtx mem, addr, base, insn;
9293 addr = choose_baseaddr (cfa_offset);
9294 mem = gen_frame_mem (mode, addr);
9296 /* For SSE saves, we need to indicate the 128-bit alignment. */
9297 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9299 insn = emit_move_insn (mem, reg);
9300 RTX_FRAME_RELATED_P (insn) = 1;
9302 base = addr;
9303 if (GET_CODE (base) == PLUS)
9304 base = XEXP (base, 0);
9305 gcc_checking_assert (REG_P (base));
9307 /* When saving registers into a re-aligned local stack frame, avoid
9308 any tricky guessing by dwarf2out. */
9309 if (m->fs.realigned)
9311 gcc_checking_assert (stack_realign_drap);
9313 if (regno == REGNO (crtl->drap_reg))
9315 /* A bit of a hack. We force the DRAP register to be saved in
9316 the re-aligned stack frame, which provides us with a copy
9317 of the CFA that will last past the prologue. Install it. */
9318 gcc_checking_assert (cfun->machine->fs.fp_valid);
9319 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9320 cfun->machine->fs.fp_offset - cfa_offset);
9321 mem = gen_rtx_MEM (mode, addr);
9322 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9324 else
9326 /* The frame pointer is a stable reference within the
9327 aligned frame. Use it. */
9328 gcc_checking_assert (cfun->machine->fs.fp_valid);
9329 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9330 cfun->machine->fs.fp_offset - cfa_offset);
9331 mem = gen_rtx_MEM (mode, addr);
9332 add_reg_note (insn, REG_CFA_EXPRESSION,
9333 gen_rtx_SET (VOIDmode, mem, reg));
9337 /* The memory may not be relative to the current CFA register,
9338 which means that we may need to generate a new pattern for
9339 use by the unwind info. */
9340 else if (base != m->fs.cfa_reg)
9342 addr = plus_constant (Pmode, m->fs.cfa_reg,
9343 m->fs.cfa_offset - cfa_offset);
9344 mem = gen_rtx_MEM (mode, addr);
9345 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9349 /* Emit code to save registers using MOV insns.
9350 First register is stored at CFA - CFA_OFFSET. */
9351 static void
9352 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9354 unsigned int regno;
9356 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9357 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9359 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9360 cfa_offset -= UNITS_PER_WORD;
9364 /* Emit code to save SSE registers using MOV insns.
9365 First register is stored at CFA - CFA_OFFSET. */
9366 static void
9367 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9369 unsigned int regno;
9371 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9372 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9374 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9375 cfa_offset -= 16;
9379 static GTY(()) rtx queued_cfa_restores;
9381 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9382 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9383 Don't add the note if the previously saved value will be left untouched
9384 within the stack red zone until return, as unwinders can find the same value
9385 in the register and on the stack. */
9387 static void
9388 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9390 if (!crtl->shrink_wrapped
9391 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9392 return;
9394 if (insn)
9396 add_reg_note (insn, REG_CFA_RESTORE, reg);
9397 RTX_FRAME_RELATED_P (insn) = 1;
9399 else
9400 queued_cfa_restores
9401 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9404 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9406 static void
9407 ix86_add_queued_cfa_restore_notes (rtx insn)
9409 rtx last;
9410 if (!queued_cfa_restores)
9411 return;
9412 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9414 XEXP (last, 1) = REG_NOTES (insn);
9415 REG_NOTES (insn) = queued_cfa_restores;
9416 queued_cfa_restores = NULL_RTX;
9417 RTX_FRAME_RELATED_P (insn) = 1;
9420 /* Expand prologue or epilogue stack adjustment.
9421 The pattern exists to put a dependency on all ebp-based memory accesses.
9422 STYLE should be negative if instructions should be marked as frame related,
9423 zero if the %r11 register is live and cannot be freely used, and positive
9424 otherwise. */
9426 static void
9427 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9428 int style, bool set_cfa)
9430 struct machine_function *m = cfun->machine;
9431 rtx insn;
9432 bool add_frame_related_expr = false;
9434 if (Pmode == SImode)
9435 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9436 else if (x86_64_immediate_operand (offset, DImode))
9437 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9438 else
9440 rtx tmp;
9441 /* r11 is also used by the indirect sibcall return; it is set before the
9442 epilogue and used after the epilogue. */
9443 if (style)
9444 tmp = gen_rtx_REG (DImode, R11_REG);
9445 else
9447 gcc_assert (src != hard_frame_pointer_rtx
9448 && dest != hard_frame_pointer_rtx);
9449 tmp = hard_frame_pointer_rtx;
9451 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9452 if (style < 0)
9453 add_frame_related_expr = true;
9455 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9458 insn = emit_insn (insn);
9459 if (style >= 0)
9460 ix86_add_queued_cfa_restore_notes (insn);
9462 if (set_cfa)
9464 rtx r;
9466 gcc_assert (m->fs.cfa_reg == src);
9467 m->fs.cfa_offset += INTVAL (offset);
9468 m->fs.cfa_reg = dest;
9470 r = gen_rtx_PLUS (Pmode, src, offset);
9471 r = gen_rtx_SET (VOIDmode, dest, r);
9472 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9473 RTX_FRAME_RELATED_P (insn) = 1;
9475 else if (style < 0)
9477 RTX_FRAME_RELATED_P (insn) = 1;
9478 if (add_frame_related_expr)
9480 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9481 r = gen_rtx_SET (VOIDmode, dest, r);
9482 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9486 if (dest == stack_pointer_rtx)
9488 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9489 bool valid = m->fs.sp_valid;
9491 if (src == hard_frame_pointer_rtx)
9493 valid = m->fs.fp_valid;
9494 ooffset = m->fs.fp_offset;
9496 else if (src == crtl->drap_reg)
9498 valid = m->fs.drap_valid;
9499 ooffset = 0;
9501 else
9503 /* Else there are two possibilities: SP itself, which we set
9504 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9505 taken care of by hand along the eh_return path. */
9506 gcc_checking_assert (src == stack_pointer_rtx
9507 || offset == const0_rtx);
9510 m->fs.sp_offset = ooffset - INTVAL (offset);
9511 m->fs.sp_valid = valid;
9515 /* Find an available register to be used as the dynamic realign argument
9516 pointer register. Such a register will be written in the prologue and
9517 used at the beginning of the body, so it must not be
9518 1. a parameter passing register.
9519 2. the GOT pointer.
9520 We reuse the static-chain register if it is available. Otherwise, we
9521 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9522 shorter encoding.
9524 Return: the regno of chosen register. */
9526 static unsigned int
9527 find_drap_reg (void)
9529 tree decl = cfun->decl;
9531 if (TARGET_64BIT)
9533 /* Use R13 for a nested function or a function that needs a static chain.
9534 Since a function with a tail call may use any caller-saved
9535 registers in the epilogue, DRAP must not use a caller-saved
9536 register in that case. */
9537 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9538 return R13_REG;
9540 return R10_REG;
9542 else
9544 /* Use DI for a nested function or a function that needs a static chain.
9545 Since a function with a tail call may use any caller-saved
9546 registers in the epilogue, DRAP must not use a caller-saved
9547 register in that case. */
9548 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9549 return DI_REG;
9551 /* Reuse static chain register if it isn't used for parameter
9552 passing. */
9553 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9555 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9556 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9557 return CX_REG;
9559 return DI_REG;
9563 /* Return minimum incoming stack alignment. */
9565 static unsigned int
9566 ix86_minimum_incoming_stack_boundary (bool sibcall)
9568 unsigned int incoming_stack_boundary;
9570 /* Prefer the one specified at command line. */
9571 if (ix86_user_incoming_stack_boundary)
9572 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9573 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9574 if -mstackrealign is used, this is not for a sibcall check, and the
9575 estimated stack alignment is 128 bits. */
9576 else if (!sibcall
9577 && !TARGET_64BIT
9578 && ix86_force_align_arg_pointer
9579 && crtl->stack_alignment_estimated == 128)
9580 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9581 else
9582 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9584 /* Incoming stack alignment can be changed on individual functions
9585 via force_align_arg_pointer attribute. We use the smallest
9586 incoming stack boundary. */
9587 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9588 && lookup_attribute (ix86_force_align_arg_pointer_string,
9589 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9590 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9592 /* The incoming stack frame has to be aligned at least at
9593 parm_stack_boundary. */
9594 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9595 incoming_stack_boundary = crtl->parm_stack_boundary;
9597 /* The stack at the entry of main is aligned by the runtime. We use the
9598 smallest incoming stack boundary. */
9599 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9600 && DECL_NAME (current_function_decl)
9601 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9602 && DECL_FILE_SCOPE_P (current_function_decl))
9603 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9605 return incoming_stack_boundary;
9608 /* Update incoming stack boundary and estimated stack alignment. */
9610 static void
9611 ix86_update_stack_boundary (void)
9613 ix86_incoming_stack_boundary
9614 = ix86_minimum_incoming_stack_boundary (false);
9616 /* x86_64 varargs need 16-byte stack alignment for the register save
9617 area. */
9618 if (TARGET_64BIT
9619 && cfun->stdarg
9620 && crtl->stack_alignment_estimated < 128)
9621 crtl->stack_alignment_estimated = 128;
9624 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9625 needed or an rtx for DRAP otherwise. */
9627 static rtx
9628 ix86_get_drap_rtx (void)
9630 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9631 crtl->need_drap = true;
9633 if (stack_realign_drap)
9635 /* Assign DRAP to vDRAP and return vDRAP. */
9636 unsigned int regno = find_drap_reg ();
9637 rtx drap_vreg;
9638 rtx arg_ptr;
9639 rtx seq, insn;
9641 arg_ptr = gen_rtx_REG (Pmode, regno);
9642 crtl->drap_reg = arg_ptr;
9644 start_sequence ();
9645 drap_vreg = copy_to_reg (arg_ptr);
9646 seq = get_insns ();
9647 end_sequence ();
9649 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9650 if (!optimize)
9652 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9653 RTX_FRAME_RELATED_P (insn) = 1;
9655 return drap_vreg;
9657 else
9658 return NULL;
9661 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9663 static rtx
9664 ix86_internal_arg_pointer (void)
9666 return virtual_incoming_args_rtx;
9669 struct scratch_reg {
9670 rtx reg;
9671 bool saved;
9674 /* Return a short-lived scratch register for use on function entry.
9675 In 32-bit mode, it is valid only after the registers are saved
9676 in the prologue. This register must be released by means of
9677 release_scratch_register_on_entry once it is dead. */
9679 static void
9680 get_scratch_register_on_entry (struct scratch_reg *sr)
9682 int regno;
9684 sr->saved = false;
9686 if (TARGET_64BIT)
9688 /* We always use R11 in 64-bit mode. */
9689 regno = R11_REG;
9691 else
9693 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9694 bool fastcall_p
9695 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9696 bool thiscall_p
9697 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9698 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9699 int regparm = ix86_function_regparm (fntype, decl);
9700 int drap_regno
9701 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9703 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9704 for the static chain register. */
9705 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9706 && drap_regno != AX_REG)
9707 regno = AX_REG;
9708 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9709 for the static chain register. */
9710 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9711 regno = AX_REG;
9712 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9713 regno = DX_REG;
9714 /* ecx is the static chain register. */
9715 else if (regparm < 3 && !fastcall_p && !thiscall_p
9716 && !static_chain_p
9717 && drap_regno != CX_REG)
9718 regno = CX_REG;
9719 else if (ix86_save_reg (BX_REG, true))
9720 regno = BX_REG;
9721 /* esi is the static chain register. */
9722 else if (!(regparm == 3 && static_chain_p)
9723 && ix86_save_reg (SI_REG, true))
9724 regno = SI_REG;
9725 else if (ix86_save_reg (DI_REG, true))
9726 regno = DI_REG;
9727 else
9729 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9730 sr->saved = true;
9734 sr->reg = gen_rtx_REG (Pmode, regno);
9735 if (sr->saved)
9737 rtx insn = emit_insn (gen_push (sr->reg));
9738 RTX_FRAME_RELATED_P (insn) = 1;
9742 /* Release a scratch register obtained from the preceding function. */
9744 static void
9745 release_scratch_register_on_entry (struct scratch_reg *sr)
9747 if (sr->saved)
9749 struct machine_function *m = cfun->machine;
9750 rtx x, insn = emit_insn (gen_pop (sr->reg));
9752 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9753 RTX_FRAME_RELATED_P (insn) = 1;
9754 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9755 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9756 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9757 m->fs.sp_offset -= UNITS_PER_WORD;
9761 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9763 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9765 static void
9766 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9768 /* We skip the probe for the first interval + a small dope of 4 words and
9769 probe that many bytes past the specified size to maintain a protection
9770 area at the bottom of the stack. */
9771 const int dope = 4 * UNITS_PER_WORD;
9772 rtx size_rtx = GEN_INT (size), last;
9774 /* See if we have a constant small number of probes to generate. If so,
9775 that's the easy case. The run-time loop is made up of 11 insns in the
9776 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9777 for n # of intervals. */
9778 if (size <= 5 * PROBE_INTERVAL)
9780 HOST_WIDE_INT i, adjust;
9781 bool first_probe = true;
9783 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9784 values of N from 1 until it exceeds SIZE. If only one probe is
9785 needed, this will not generate any code. Then adjust and probe
9786 to PROBE_INTERVAL + SIZE. */
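/* For illustration, assuming a 4096-byte PROBE_INTERVAL and 64-bit mode
   (dope = 32 bytes), SIZE == 10000 gives:
     SP -= 8224, probe; SP -= 4096, probe; SP -= 1808, probe; SP += 4128;
   a net adjustment of exactly 10000 bytes.  */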
9787 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9789 if (first_probe)
9791 adjust = 2 * PROBE_INTERVAL + dope;
9792 first_probe = false;
9794 else
9795 adjust = PROBE_INTERVAL;
9797 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9798 plus_constant (Pmode, stack_pointer_rtx,
9799 -adjust)));
9800 emit_stack_probe (stack_pointer_rtx);
9803 if (first_probe)
9804 adjust = size + PROBE_INTERVAL + dope;
9805 else
9806 adjust = size + PROBE_INTERVAL - i;
9808 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9809 plus_constant (Pmode, stack_pointer_rtx,
9810 -adjust)));
9811 emit_stack_probe (stack_pointer_rtx);
9813 /* Adjust back to account for the additional first interval. */
9814 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9815 plus_constant (Pmode, stack_pointer_rtx,
9816 PROBE_INTERVAL + dope)));
9819 /* Otherwise, do the same as above, but in a loop. Note that we must be
9820 extra careful with variables wrapping around because we might be at
9821 the very top (or the very bottom) of the address space and we have
9822 to be able to handle this case properly; in particular, we use an
9823 equality test for the loop condition. */
9824 else
9826 HOST_WIDE_INT rounded_size;
9827 struct scratch_reg sr;
9829 get_scratch_register_on_entry (&sr);
9832 /* Step 1: round SIZE to the previous multiple of the interval. */
9834 rounded_size = size & -PROBE_INTERVAL;
9837 /* Step 2: compute initial and final value of the loop counter. */
9839 /* SP = SP_0 + PROBE_INTERVAL. */
9840 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9841 plus_constant (Pmode, stack_pointer_rtx,
9842 - (PROBE_INTERVAL + dope))));
9844 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9845 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9846 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9847 gen_rtx_PLUS (Pmode, sr.reg,
9848 stack_pointer_rtx)));
9851 /* Step 3: the loop
9853 while (SP != LAST_ADDR)
9855 SP = SP + PROBE_INTERVAL
9856 probe at SP
9859 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9860 values of N from 1 until it is equal to ROUNDED_SIZE. */
9862 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9865 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9866 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9868 if (size != rounded_size)
9870 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9871 plus_constant (Pmode, stack_pointer_rtx,
9872 rounded_size - size)));
9873 emit_stack_probe (stack_pointer_rtx);
9876 /* Adjust back to account for the additional first interval. */
9877 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9878 plus_constant (Pmode, stack_pointer_rtx,
9879 PROBE_INTERVAL + dope)));
9881 release_scratch_register_on_entry (&sr);
9884 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9886 /* Even if the stack pointer isn't the CFA register, we need to correctly
9887 describe the adjustments made to it, in particular differentiate the
9888 frame-related ones from the frame-unrelated ones. */
9889 if (size > 0)
9891 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9892 XVECEXP (expr, 0, 0)
9893 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9894 plus_constant (Pmode, stack_pointer_rtx, -size));
9895 XVECEXP (expr, 0, 1)
9896 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9897 plus_constant (Pmode, stack_pointer_rtx,
9898 PROBE_INTERVAL + dope + size));
9899 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9900 RTX_FRAME_RELATED_P (last) = 1;
9902 cfun->machine->fs.sp_offset += size;
9905 /* Make sure nothing is scheduled before we are done. */
9906 emit_insn (gen_blockage ());
9909 /* Adjust the stack pointer up to REG while probing it. */
9911 const char *
9912 output_adjust_stack_and_probe (rtx reg)
9914 static int labelno = 0;
9915 char loop_lab[32], end_lab[32];
9916 rtx xops[2];
9918 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9919 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9921 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9923 /* Jump to END_LAB if SP == LAST_ADDR. */
9924 xops[0] = stack_pointer_rtx;
9925 xops[1] = reg;
9926 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9927 fputs ("\tje\t", asm_out_file);
9928 assemble_name_raw (asm_out_file, end_lab);
9929 fputc ('\n', asm_out_file);
9931 /* SP = SP + PROBE_INTERVAL. */
9932 xops[1] = GEN_INT (PROBE_INTERVAL);
9933 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9935 /* Probe at SP. */
9936 xops[1] = const0_rtx;
9937 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9939 fprintf (asm_out_file, "\tjmp\t");
9940 assemble_name_raw (asm_out_file, loop_lab);
9941 fputc ('\n', asm_out_file);
9943 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9945 return "";
9948 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9949 inclusive. These are offsets from the current stack pointer. */
9951 static void
9952 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9954 /* See if we have a constant small number of probes to generate. If so,
9955 that's the easy case. The run-time loop is made up of 7 insns in the
9956 generic case while the compile-time loop is made up of n insns for n #
9957 of intervals. */
9958 if (size <= 7 * PROBE_INTERVAL)
9960 HOST_WIDE_INT i;
9962 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9963 it exceeds SIZE. If only one probe is needed, this will not
9964 generate any code. Then probe at FIRST + SIZE. */
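/* For illustration, assuming a 4096-byte PROBE_INTERVAL, FIRST == 16384 and
   SIZE == 10000 emit probes at sp - 20480, sp - 24576 and finally sp - 26384,
   leaving the stack pointer itself unchanged.  */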
9965 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9966 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9967 -(first + i)));
9969 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9970 -(first + size)));
9973 /* Otherwise, do the same as above, but in a loop. Note that we must be
9974 extra careful with variables wrapping around because we might be at
9975 the very top (or the very bottom) of the address space and we have
9976 to be able to handle this case properly; in particular, we use an
9977 equality test for the loop condition. */
9978 else
9980 HOST_WIDE_INT rounded_size, last;
9981 struct scratch_reg sr;
9983 get_scratch_register_on_entry (&sr);
9986 /* Step 1: round SIZE to the previous multiple of the interval. */
9988 rounded_size = size & -PROBE_INTERVAL;
9991 /* Step 2: compute initial and final value of the loop counter. */
9993 /* TEST_OFFSET = FIRST. */
9994 emit_move_insn (sr.reg, GEN_INT (-first));
9996 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9997 last = first + rounded_size;
10000 /* Step 3: the loop
10002 while (TEST_ADDR != LAST_ADDR)
10004 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10005 probe at TEST_ADDR
10008 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10009 until it is equal to ROUNDED_SIZE. */
10011 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10014 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10015 that SIZE is equal to ROUNDED_SIZE. */
10017 if (size != rounded_size)
10018 emit_stack_probe (plus_constant (Pmode,
10019 gen_rtx_PLUS (Pmode,
10020 stack_pointer_rtx,
10021 sr.reg),
10022 rounded_size - size));
10024 release_scratch_register_on_entry (&sr);
10027 /* Make sure nothing is scheduled before we are done. */
10028 emit_insn (gen_blockage ());
10031 /* Probe a range of stack addresses from REG to END, inclusive. These are
10032 offsets from the current stack pointer. */
10034 const char *
10035 output_probe_stack_range (rtx reg, rtx end)
10037 static int labelno = 0;
10038 char loop_lab[32], end_lab[32];
10039 rtx xops[3];
10041 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10042 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10044 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10046 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10047 xops[0] = reg;
10048 xops[1] = end;
10049 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10050 fputs ("\tje\t", asm_out_file);
10051 assemble_name_raw (asm_out_file, end_lab);
10052 fputc ('\n', asm_out_file);
10054 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10055 xops[1] = GEN_INT (PROBE_INTERVAL);
10056 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10058 /* Probe at TEST_ADDR. */
10059 xops[0] = stack_pointer_rtx;
10060 xops[1] = reg;
10061 xops[2] = const0_rtx;
10062 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10064 fprintf (asm_out_file, "\tjmp\t");
10065 assemble_name_raw (asm_out_file, loop_lab);
10066 fputc ('\n', asm_out_file);
10068 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10070 return "";
10073 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10074 to be generated in correct form. */
10075 static void
10076 ix86_finalize_stack_realign_flags (void)
10078 /* Check if stack realignment is really needed after reload, and
10079 store the result in cfun. */
10080 unsigned int incoming_stack_boundary
10081 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10082 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10083 unsigned int stack_realign = (incoming_stack_boundary
10084 < (crtl->is_leaf
10085 ? crtl->max_used_stack_slot_alignment
10086 : crtl->stack_alignment_needed));
10088 if (crtl->stack_realign_finalized)
10090 /* After stack_realign_needed is finalized, we can no longer
10091 change it. */
10092 gcc_assert (crtl->stack_realign_needed == stack_realign);
10093 return;
10096 /* If the only reason for frame_pointer_needed is that we conservatively
10097 assumed stack realignment might be needed, but in the end nothing that
10098 needed the stack alignment had been spilled, clear frame_pointer_needed
10099 and say we don't need stack realignment. */
10100 if (stack_realign
10101 && !crtl->need_drap
10102 && frame_pointer_needed
10103 && crtl->is_leaf
10104 && flag_omit_frame_pointer
10105 && crtl->sp_is_unchanging
10106 && !ix86_current_function_calls_tls_descriptor
10107 && !crtl->accesses_prior_frames
10108 && !cfun->calls_alloca
10109 && !crtl->calls_eh_return
10110 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10111 && !ix86_frame_pointer_required ()
10112 && get_frame_size () == 0
10113 && ix86_nsaved_sseregs () == 0
10114 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10116 HARD_REG_SET set_up_by_prologue, prologue_used;
10117 basic_block bb;
10119 CLEAR_HARD_REG_SET (prologue_used);
10120 CLEAR_HARD_REG_SET (set_up_by_prologue);
10121 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10122 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10123 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10124 HARD_FRAME_POINTER_REGNUM);
10125 FOR_EACH_BB (bb)
10127 rtx insn;
10128 FOR_BB_INSNS (bb, insn)
10129 if (NONDEBUG_INSN_P (insn)
10130 && requires_stack_frame_p (insn, prologue_used,
10131 set_up_by_prologue))
10133 crtl->stack_realign_needed = stack_realign;
10134 crtl->stack_realign_finalized = true;
10135 return;
10139 frame_pointer_needed = false;
10140 stack_realign = false;
10141 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10142 crtl->stack_alignment_needed = incoming_stack_boundary;
10143 crtl->stack_alignment_estimated = incoming_stack_boundary;
10144 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10145 crtl->preferred_stack_boundary = incoming_stack_boundary;
10146 df_finish_pass (true);
10147 df_scan_alloc (NULL);
10148 df_scan_blocks ();
10149 df_compute_regs_ever_live (true);
10150 df_analyze ();
10153 crtl->stack_realign_needed = stack_realign;
10154 crtl->stack_realign_finalized = true;
10157 /* Expand the prologue into a bunch of separate insns. */
10159 void
10160 ix86_expand_prologue (void)
10162 struct machine_function *m = cfun->machine;
10163 rtx insn, t;
10164 bool pic_reg_used;
10165 struct ix86_frame frame;
10166 HOST_WIDE_INT allocate;
10167 bool int_registers_saved;
10168 bool sse_registers_saved;
10170 ix86_finalize_stack_realign_flags ();
10172 /* DRAP should not coexist with stack_realign_fp */
10173 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10175 memset (&m->fs, 0, sizeof (m->fs));
10177 /* Initialize CFA state for before the prologue. */
10178 m->fs.cfa_reg = stack_pointer_rtx;
10179 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10181 /* Track SP offset to the CFA. We continue tracking this after we've
10182 swapped the CFA register away from SP. In the case of re-alignment
10183 this is fudged; we're interested in offsets within the local frame. */
10184 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10185 m->fs.sp_valid = true;
10187 ix86_compute_frame_layout (&frame);
10189 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10191 /* We should have already generated an error for any use of
10192 ms_hook on a nested function. */
10193 gcc_checking_assert (!ix86_static_chain_on_stack);
10195 /* Check if profiling is active and we shall use the profiling-before-prologue
10196 variant. If so, issue a sorry. */
10197 if (crtl->profile && flag_fentry != 0)
10198 sorry ("ms_hook_prologue attribute isn%'t compatible "
10199 "with -mfentry for 32-bit");
10201 /* In ix86_asm_output_function_label we emitted:
10202 8b ff movl.s %edi,%edi
10203 55 push %ebp
10204 8b ec movl.s %esp,%ebp
10206 This matches the hookable function prologue in Win32 API
10207 functions in Microsoft Windows XP Service Pack 2 and newer.
10208 Wine uses this to enable Windows apps to hook the Win32 API
10209 functions provided by Wine.
10211 What that means is that we've already set up the frame pointer. */
10213 if (frame_pointer_needed
10214 && !(crtl->drap_reg && crtl->stack_realign_needed))
10216 rtx push, mov;
10218 /* We've decided to use the frame pointer already set up.
10219 Describe this to the unwinder by pretending that both
10220 push and mov insns happen right here.
10222 Putting the unwind info here at the end of the ms_hook
10223 is done so that we can make absolutely certain we get
10224 the required byte sequence at the start of the function,
10225 rather than relying on an assembler that can produce
10226 the exact encoding required.
10228 However it does mean (in the unpatched case) that we have
10229 a 1 insn window where the asynchronous unwind info is
10230 incorrect. However, if we placed the unwind info at
10231 its correct location we would have incorrect unwind info
10232 in the patched case. Which is probably all moot since
10233 I don't expect Wine generates dwarf2 unwind info for the
10234 system libraries that use this feature. */
10236 insn = emit_insn (gen_blockage ());
10238 push = gen_push (hard_frame_pointer_rtx);
10239 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10240 stack_pointer_rtx);
10241 RTX_FRAME_RELATED_P (push) = 1;
10242 RTX_FRAME_RELATED_P (mov) = 1;
10244 RTX_FRAME_RELATED_P (insn) = 1;
10245 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10246 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10248 /* Note that gen_push incremented m->fs.cfa_offset, even
10249 though we didn't emit the push insn here. */
10250 m->fs.cfa_reg = hard_frame_pointer_rtx;
10251 m->fs.fp_offset = m->fs.cfa_offset;
10252 m->fs.fp_valid = true;
10254 else
10256 /* The frame pointer is not needed so pop %ebp again.
10257 This leaves us with a pristine state. */
10258 emit_insn (gen_pop (hard_frame_pointer_rtx));
10262 /* The first insn of a function that accepts its static chain on the
10263 stack is to push the register that would be filled in by a direct
10264 call. This insn will be skipped by the trampoline. */
10265 else if (ix86_static_chain_on_stack)
10267 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10268 emit_insn (gen_blockage ());
10270 /* We don't want to interpret this push insn as a register save,
10271 only as a stack adjustment. The real copy of the register as
10272 a save will be done later, if needed. */
10273 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10274 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10275 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10276 RTX_FRAME_RELATED_P (insn) = 1;
10279 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10280 DRAP is needed and stack realignment is really needed after reload. */
10281 if (stack_realign_drap)
10283 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10285 /* Only need to push parameter pointer reg if it is caller saved. */
10286 if (!call_used_regs[REGNO (crtl->drap_reg)])
10288 /* Push arg pointer reg */
10289 insn = emit_insn (gen_push (crtl->drap_reg));
10290 RTX_FRAME_RELATED_P (insn) = 1;
10293 /* Grab the argument pointer. */
10294 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10295 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10296 RTX_FRAME_RELATED_P (insn) = 1;
10297 m->fs.cfa_reg = crtl->drap_reg;
10298 m->fs.cfa_offset = 0;
10300 /* Align the stack. */
10301 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10302 stack_pointer_rtx,
10303 GEN_INT (-align_bytes)));
10304 RTX_FRAME_RELATED_P (insn) = 1;
10306 /* Replicate the return address on the stack so that return
10307 address can be reached via (argp - 1) slot. This is needed
10308 to implement macro RETURN_ADDR_RTX and intrinsic function
10309 expand_builtin_return_addr etc. */
10310 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10311 t = gen_frame_mem (word_mode, t);
10312 insn = emit_insn (gen_push (t));
10313 RTX_FRAME_RELATED_P (insn) = 1;
10315 /* For the purposes of frame and register save area addressing,
10316 we've started over with a new frame. */
10317 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10318 m->fs.realigned = true;
10321 int_registers_saved = (frame.nregs == 0);
10322 sse_registers_saved = (frame.nsseregs == 0);
10324 if (frame_pointer_needed && !m->fs.fp_valid)
10326 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10327 slower on all targets. Also sdb doesn't like it. */
10328 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10329 RTX_FRAME_RELATED_P (insn) = 1;
10331 /* Push registers now, before setting the frame pointer
10332 on SEH target. */
10333 if (!int_registers_saved
10334 && TARGET_SEH
10335 && !frame.save_regs_using_mov)
10337 ix86_emit_save_regs ();
10338 int_registers_saved = true;
10339 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10342 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10344 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10345 RTX_FRAME_RELATED_P (insn) = 1;
10347 if (m->fs.cfa_reg == stack_pointer_rtx)
10348 m->fs.cfa_reg = hard_frame_pointer_rtx;
10349 m->fs.fp_offset = m->fs.sp_offset;
10350 m->fs.fp_valid = true;
10354 if (!int_registers_saved)
10356 /* If saving registers via PUSH, do so now. */
10357 if (!frame.save_regs_using_mov)
10359 ix86_emit_save_regs ();
10360 int_registers_saved = true;
10361 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10364 /* When using the red zone we may start saving registers before allocating
10365 the stack frame, saving one cycle of the prologue. However, avoid
10366 doing this if we have to probe the stack; at least on x86_64 the
10367 stack probe can turn into a call that clobbers a red zone location. */
10368 else if (ix86_using_red_zone ()
10369 && (! TARGET_STACK_PROBE
10370 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10372 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10373 int_registers_saved = true;
10377 if (stack_realign_fp)
10379 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10380 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10382 /* The computation of the size of the re-aligned stack frame means
10383 that we must allocate the size of the register save area before
10384 performing the actual alignment. Otherwise we cannot guarantee
10385 that there's enough storage above the realignment point. */
10386 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10387 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10388 GEN_INT (m->fs.sp_offset
10389 - frame.sse_reg_save_offset),
10390 -1, false);
10392 /* Align the stack. */
10393 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10394 stack_pointer_rtx,
10395 GEN_INT (-align_bytes)));
10397 /* For the purposes of register save area addressing, the stack
10398 pointer is no longer valid. As for the value of sp_offset,
10399 see ix86_compute_frame_layout, which we need to match in order
10400 to pass verification of stack_pointer_offset at the end. */
10401 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10402 m->fs.sp_valid = false;
10405 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10407 if (flag_stack_usage_info)
10409 /* We start to count from ARG_POINTER. */
10410 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10412 /* If it was realigned, take into account the fake frame. */
10413 if (stack_realign_drap)
10415 if (ix86_static_chain_on_stack)
10416 stack_size += UNITS_PER_WORD;
10418 if (!call_used_regs[REGNO (crtl->drap_reg)])
10419 stack_size += UNITS_PER_WORD;
10421 /* This over-estimates by 1 minimal-stack-alignment-unit but
10422 mitigates that by counting in the new return address slot. */
10423 current_function_dynamic_stack_size
10424 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10427 current_function_static_stack_size = stack_size;
10430 /* On SEH target with very large frame size, allocate an area to save
10431 SSE registers (as the very large allocation won't be described). */
10432 if (TARGET_SEH
10433 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10434 && !sse_registers_saved)
10436 HOST_WIDE_INT sse_size =
10437 frame.sse_reg_save_offset - frame.reg_save_offset;
10439 gcc_assert (int_registers_saved);
10441 /* No need to do stack checking as the area will be immediately
10442 written. */
10443 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10444 GEN_INT (-sse_size), -1,
10445 m->fs.cfa_reg == stack_pointer_rtx);
10446 allocate -= sse_size;
10447 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10448 sse_registers_saved = true;
10451 /* The stack has already been decremented by the instruction calling us
10452 so probe if the size is non-negative to preserve the protection area. */
10453 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10455 /* We expect the registers to be saved when probes are used. */
10456 gcc_assert (int_registers_saved);
10458 if (STACK_CHECK_MOVING_SP)
10460 ix86_adjust_stack_and_probe (allocate);
10461 allocate = 0;
10463 else
10465 HOST_WIDE_INT size = allocate;
10467 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10468 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10470 if (TARGET_STACK_PROBE)
10471 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10472 else
10473 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10477 if (allocate == 0)
10479 else if (!ix86_target_stack_probe ()
10480 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10482 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10483 GEN_INT (-allocate), -1,
10484 m->fs.cfa_reg == stack_pointer_rtx);
10486 else
10488 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10489 rtx r10 = NULL;
10490 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10491 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10492 bool eax_live = false;
10493 bool r10_live = false;
10495 if (TARGET_64BIT)
10496 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10497 if (!TARGET_64BIT_MS_ABI)
10498 eax_live = ix86_eax_live_at_start_p ();
10500 /* Note that SEH directives need to continue tracking the stack
10501 pointer even after the frame pointer has been set up. */
10502 if (eax_live)
10504 insn = emit_insn (gen_push (eax));
10505 allocate -= UNITS_PER_WORD;
10506 if (sp_is_cfa_reg || TARGET_SEH)
10508 if (sp_is_cfa_reg)
10509 m->fs.cfa_offset += UNITS_PER_WORD;
10510 RTX_FRAME_RELATED_P (insn) = 1;
10514 if (r10_live)
10516 r10 = gen_rtx_REG (Pmode, R10_REG);
10517 insn = emit_insn (gen_push (r10));
10518 allocate -= UNITS_PER_WORD;
10519 if (sp_is_cfa_reg || TARGET_SEH)
10521 if (sp_is_cfa_reg)
10522 m->fs.cfa_offset += UNITS_PER_WORD;
10523 RTX_FRAME_RELATED_P (insn) = 1;
10527 emit_move_insn (eax, GEN_INT (allocate));
10528 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10530 /* Use the fact that AX still contains ALLOCATE. */
10531 adjust_stack_insn = (Pmode == DImode
10532 ? gen_pro_epilogue_adjust_stack_di_sub
10533 : gen_pro_epilogue_adjust_stack_si_sub);
10535 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10536 stack_pointer_rtx, eax));
10538 if (sp_is_cfa_reg || TARGET_SEH)
10540 if (sp_is_cfa_reg)
10541 m->fs.cfa_offset += allocate;
10542 RTX_FRAME_RELATED_P (insn) = 1;
10543 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10544 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10545 plus_constant (Pmode, stack_pointer_rtx,
10546 -allocate)));
10548 m->fs.sp_offset += allocate;
10550 if (r10_live && eax_live)
10552 t = choose_baseaddr (m->fs.sp_offset - allocate);
10553 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10554 gen_frame_mem (word_mode, t));
10555 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10556 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10557 gen_frame_mem (word_mode, t));
10559 else if (eax_live || r10_live)
10561 t = choose_baseaddr (m->fs.sp_offset - allocate);
10562 emit_move_insn (gen_rtx_REG (word_mode,
10563 (eax_live ? AX_REG : R10_REG)),
10564 gen_frame_mem (word_mode, t));
10567 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10569 /* If we haven't already set up the frame pointer, do so now. */
10570 if (frame_pointer_needed && !m->fs.fp_valid)
10572 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10573 GEN_INT (frame.stack_pointer_offset
10574 - frame.hard_frame_pointer_offset));
10575 insn = emit_insn (insn);
10576 RTX_FRAME_RELATED_P (insn) = 1;
10577 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10579 if (m->fs.cfa_reg == stack_pointer_rtx)
10580 m->fs.cfa_reg = hard_frame_pointer_rtx;
10581 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10582 m->fs.fp_valid = true;
10585 if (!int_registers_saved)
10586 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10587 if (!sse_registers_saved)
10588 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10590 pic_reg_used = false;
10591 if (pic_offset_table_rtx
10592 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10593 || crtl->profile))
10595 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10597 if (alt_pic_reg_used != INVALID_REGNUM)
10598 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10600 pic_reg_used = true;
10603 if (pic_reg_used)
10605 if (TARGET_64BIT)
10607 if (ix86_cmodel == CM_LARGE_PIC)
10609 rtx label, tmp_reg;
10611 gcc_assert (Pmode == DImode);
10612 label = gen_label_rtx ();
10613 emit_label (label);
10614 LABEL_PRESERVE_P (label) = 1;
10615 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10616 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10617 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10618 label));
10619 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10620 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10621 pic_offset_table_rtx, tmp_reg));
10623 else
10624 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10626 else
10628 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10629 RTX_FRAME_RELATED_P (insn) = 1;
10630 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10634 /* In the pic_reg_used case, make sure that the got load isn't deleted
10635 when mcount needs it. Blockage to avoid call movement across mcount
10636 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10637 note. */
10638 if (crtl->profile && !flag_fentry && pic_reg_used)
10639 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10641 if (crtl->drap_reg && !crtl->stack_realign_needed)
10643 /* vDRAP is set up, but after reload it turns out stack realignment
10644 isn't necessary; here we emit prologue code to set up DRAP
10645 without the stack realignment adjustment. */
10646 t = choose_baseaddr (0);
10647 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10650 /* Prevent instructions from being scheduled into register save push
10651 sequence when access to the redzone area is done through frame pointer.
10652 The offset between the frame pointer and the stack pointer is calculated
10653 relative to the value of the stack pointer at the end of the function
10654 prologue, and moving instructions that access redzone area via frame
10655 pointer inside push sequence violates this assumption. */
10656 if (frame_pointer_needed && frame.red_zone_size)
10657 emit_insn (gen_memory_blockage ());
10659 /* Emit cld instruction if stringops are used in the function. */
10660 if (TARGET_CLD && ix86_current_function_needs_cld)
10661 emit_insn (gen_cld ());
10663 /* SEH requires that the prologue end within 256 bytes of the start of
10664 the function. Prevent instruction schedules that would extend that.
10665 Further, prevent alloca modifications to the stack pointer from being
10666 combined with prologue modifications. */
10667 if (TARGET_SEH)
10668 emit_insn (gen_prologue_use (stack_pointer_rtx));
10671 /* Emit code to restore REG using a POP insn. */
10673 static void
10674 ix86_emit_restore_reg_using_pop (rtx reg)
10676 struct machine_function *m = cfun->machine;
10677 rtx insn = emit_insn (gen_pop (reg));
10679 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10680 m->fs.sp_offset -= UNITS_PER_WORD;
10682 if (m->fs.cfa_reg == crtl->drap_reg
10683 && REGNO (reg) == REGNO (crtl->drap_reg))
10685 /* Previously we'd represented the CFA as an expression
10686 like *(%ebp - 8). We've just popped that value from
10687 the stack, which means we need to reset the CFA to
10688 the drap register. This will remain until we restore
10689 the stack pointer. */
10690 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10691 RTX_FRAME_RELATED_P (insn) = 1;
10693 /* This means that the DRAP register is valid for addressing too. */
10694 m->fs.drap_valid = true;
10695 return;
10698 if (m->fs.cfa_reg == stack_pointer_rtx)
10700 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10701 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10702 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10703 RTX_FRAME_RELATED_P (insn) = 1;
10705 m->fs.cfa_offset -= UNITS_PER_WORD;
10708 /* When the frame pointer is the CFA, and we pop it, we are
10709 swapping back to the stack pointer as the CFA. This happens
10710 for stack frames that don't allocate other data, so we assume
10711 the stack pointer is now pointing at the return address, i.e.
10712 the function entry state, which makes the offset 1 word. */
10713 if (reg == hard_frame_pointer_rtx)
10715 m->fs.fp_valid = false;
10716 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10718 m->fs.cfa_reg = stack_pointer_rtx;
10719 m->fs.cfa_offset -= UNITS_PER_WORD;
10721 add_reg_note (insn, REG_CFA_DEF_CFA,
10722 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10723 GEN_INT (m->fs.cfa_offset)));
10724 RTX_FRAME_RELATED_P (insn) = 1;
10729 /* Emit code to restore saved registers using POP insns. */
10731 static void
10732 ix86_emit_restore_regs_using_pop (void)
10734 unsigned int regno;
10736 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10737 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10738 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10741 /* Emit code and notes for the LEAVE instruction. */
10743 static void
10744 ix86_emit_leave (void)
10746 struct machine_function *m = cfun->machine;
10747 rtx insn = emit_insn (ix86_gen_leave ());
10749 ix86_add_queued_cfa_restore_notes (insn);
10751 gcc_assert (m->fs.fp_valid);
10752 m->fs.sp_valid = true;
10753 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10754 m->fs.fp_valid = false;
10756 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10758 m->fs.cfa_reg = stack_pointer_rtx;
10759 m->fs.cfa_offset = m->fs.sp_offset;
10761 add_reg_note (insn, REG_CFA_DEF_CFA,
10762 plus_constant (Pmode, stack_pointer_rtx,
10763 m->fs.sp_offset));
10764 RTX_FRAME_RELATED_P (insn) = 1;
10766 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10767 m->fs.fp_offset);
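/* For reference: "leave" is equivalent to "movq %rbp, %rsp; popq %rbp"
   (or the 32-bit forms), which is why the stack pointer becomes valid
   again at fp_offset - UNITS_PER_WORD and the frame pointer becomes
   invalid in the bookkeeping above.  */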
10770 /* Emit code to restore saved registers using MOV insns.
10771 First register is restored from CFA - CFA_OFFSET. */
10772 static void
10773 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10774 bool maybe_eh_return)
10776 struct machine_function *m = cfun->machine;
10777 unsigned int regno;
10779 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10780 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10782 rtx reg = gen_rtx_REG (word_mode, regno);
10783 rtx insn, mem;
10785 mem = choose_baseaddr (cfa_offset);
10786 mem = gen_frame_mem (word_mode, mem);
10787 insn = emit_move_insn (reg, mem);
10789 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10791 /* Previously we'd represented the CFA as an expression
10792 like *(%ebp - 8). We've just loaded that value from
10793 the stack, which means we need to reset the CFA to
10794 the drap register. This will remain until we restore
10795 the stack pointer. */
10796 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10797 RTX_FRAME_RELATED_P (insn) = 1;
10799 /* This means that the DRAP register is valid for addressing. */
10800 m->fs.drap_valid = true;
10802 else
10803 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10805 cfa_offset -= UNITS_PER_WORD;
10809 /* Emit code to restore saved SSE registers using MOV insns.
10810 First register is restored from CFA - CFA_OFFSET. */
10811 static void
10812 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10813 bool maybe_eh_return)
10815 unsigned int regno;
10817 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10818 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10820 rtx reg = gen_rtx_REG (V4SFmode, regno);
10821 rtx mem;
10823 mem = choose_baseaddr (cfa_offset);
10824 mem = gen_rtx_MEM (V4SFmode, mem);
10825 set_mem_align (mem, 128);
10826 emit_move_insn (reg, mem);
10828 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10830 cfa_offset -= 16;
10834 /* Restore function stack, frame, and registers. */
10836 void
10837 ix86_expand_epilogue (int style)
10839 struct machine_function *m = cfun->machine;
10840 struct machine_frame_state frame_state_save = m->fs;
10841 struct ix86_frame frame;
10842 bool restore_regs_via_mov;
10843 bool using_drap;
10845 ix86_finalize_stack_realign_flags ();
10846 ix86_compute_frame_layout (&frame);
10848 m->fs.sp_valid = (!frame_pointer_needed
10849 || (crtl->sp_is_unchanging
10850 && !stack_realign_fp));
10851 gcc_assert (!m->fs.sp_valid
10852 || m->fs.sp_offset == frame.stack_pointer_offset);
10854 /* The FP must be valid if the frame pointer is present. */
10855 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10856 gcc_assert (!m->fs.fp_valid
10857 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10859 /* We must have *some* valid pointer to the stack frame. */
10860 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10862 /* The DRAP is never valid at this point. */
10863 gcc_assert (!m->fs.drap_valid);
10865 /* See the comment about red zone and frame
10866 pointer usage in ix86_expand_prologue. */
10867 if (frame_pointer_needed && frame.red_zone_size)
10868 emit_insn (gen_memory_blockage ());
10870 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10871 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10873 /* Determine the CFA offset of the end of the red-zone. */
10874 m->fs.red_zone_offset = 0;
10875 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10877 /* The red-zone begins below the return address. */
10878 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10880 /* When the register save area is in the aligned portion of
10881 the stack, determine the maximum runtime displacement that
10882 matches up with the aligned frame. */
10883 if (stack_realign_drap)
10884 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10885 + UNITS_PER_WORD);
10888 /* Special care must be taken for the normal return case of a function
10889 using eh_return: the eax and edx registers are marked as saved, but
10890 not restored along this path. Adjust the save location to match. */
10891 if (crtl->calls_eh_return && style != 2)
10892 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10894 /* EH_RETURN requires the use of moves to function properly. */
10895 if (crtl->calls_eh_return)
10896 restore_regs_via_mov = true;
10897 /* SEH requires the use of pops to identify the epilogue. */
10898 else if (TARGET_SEH)
10899 restore_regs_via_mov = false;
10900 /* If we're only restoring one register and sp is not valid, then
10901 use a move instruction to restore the register, since that is
10902 less work than reloading sp and popping the register. */
10903 else if (!m->fs.sp_valid && frame.nregs <= 1)
10904 restore_regs_via_mov = true;
10905 else if (TARGET_EPILOGUE_USING_MOVE
10906 && cfun->machine->use_fast_prologue_epilogue
10907 && (frame.nregs > 1
10908 || m->fs.sp_offset != frame.reg_save_offset))
10909 restore_regs_via_mov = true;
10910 else if (frame_pointer_needed
10911 && !frame.nregs
10912 && m->fs.sp_offset != frame.reg_save_offset)
10913 restore_regs_via_mov = true;
10914 else if (frame_pointer_needed
10915 && TARGET_USE_LEAVE
10916 && cfun->machine->use_fast_prologue_epilogue
10917 && frame.nregs == 1)
10918 restore_regs_via_mov = true;
10919 else
10920 restore_regs_via_mov = false;
10922 if (restore_regs_via_mov || frame.nsseregs)
10924 /* Ensure that the entire register save area is addressable via
10925 the stack pointer, if we will restore via sp. */
10926 if (TARGET_64BIT
10927 && m->fs.sp_offset > 0x7fffffff
10928 && !(m->fs.fp_valid || m->fs.drap_valid)
10929 && (frame.nsseregs + frame.nregs) != 0)
10931 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10932 GEN_INT (m->fs.sp_offset
10933 - frame.sse_reg_save_offset),
10934 style,
10935 m->fs.cfa_reg == stack_pointer_rtx);
10939 /* If there are any SSE registers to restore, then we have to do it
10940 via moves, since there's obviously no pop for SSE regs. */
10941 if (frame.nsseregs)
10942 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10943 style == 2);
10945 if (restore_regs_via_mov)
10947 rtx t;
10949 if (frame.nregs)
10950 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10952 /* eh_return epilogues need %ecx added to the stack pointer. */
10953 if (style == 2)
10955 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10957 /* Stack align doesn't work with eh_return. */
10958 gcc_assert (!stack_realign_drap);
10959 /* Neither do regparm nested functions. */
10960 gcc_assert (!ix86_static_chain_on_stack);
10962 if (frame_pointer_needed)
10964 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10965 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10966 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10968 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10969 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10971 /* Note that we use SA as a temporary CFA, as the return
10972 address is at the proper place relative to it. We
10973 pretend this happens at the FP restore insn because
10974 prior to this insn the FP would be stored at the wrong
10975 offset relative to SA, and after this insn we have no
10976 other reasonable register to use for the CFA. We don't
10977 bother resetting the CFA to the SP for the duration of
10978 the return insn. */
10979 add_reg_note (insn, REG_CFA_DEF_CFA,
10980 plus_constant (Pmode, sa, UNITS_PER_WORD));
10981 ix86_add_queued_cfa_restore_notes (insn);
10982 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10983 RTX_FRAME_RELATED_P (insn) = 1;
10985 m->fs.cfa_reg = sa;
10986 m->fs.cfa_offset = UNITS_PER_WORD;
10987 m->fs.fp_valid = false;
10989 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10990 const0_rtx, style, false);
10992 else
10994 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10995 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10996 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10997 ix86_add_queued_cfa_restore_notes (insn);
10999 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11000 if (m->fs.cfa_offset != UNITS_PER_WORD)
11002 m->fs.cfa_offset = UNITS_PER_WORD;
11003 add_reg_note (insn, REG_CFA_DEF_CFA,
11004 plus_constant (Pmode, stack_pointer_rtx,
11005 UNITS_PER_WORD));
11006 RTX_FRAME_RELATED_P (insn) = 1;
11009 m->fs.sp_offset = UNITS_PER_WORD;
11010 m->fs.sp_valid = true;
11013 else
11015 /* SEH requires that the function end with (1) a stack adjustment
11016 if necessary, (2) a sequence of pops, and (3) a return or
11017 jump instruction. Prevent insns from the function body from
11018 being scheduled into this sequence. */
11019 if (TARGET_SEH)
11021 /* Prevent a catch region from being adjacent to the standard
11022 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11023 several other flags that would be interesting to test are
11024 set up yet. */
11025 if (flag_non_call_exceptions)
11026 emit_insn (gen_nops (const1_rtx));
11027 else
11028 emit_insn (gen_blockage ());
11031 /* First step is to deallocate the stack frame so that we can
11032 pop the registers. Also do it on SEH targets for very large
11033 frames, as the emitted instructions aren't allowed by the ABI in
11034 epilogues. */
11035 if (!m->fs.sp_valid
11036 || (TARGET_SEH
11037 && (m->fs.sp_offset - frame.reg_save_offset
11038 >= SEH_MAX_FRAME_SIZE)))
11040 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11041 GEN_INT (m->fs.fp_offset
11042 - frame.reg_save_offset),
11043 style, false);
11045 else if (m->fs.sp_offset != frame.reg_save_offset)
11047 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11048 GEN_INT (m->fs.sp_offset
11049 - frame.reg_save_offset),
11050 style,
11051 m->fs.cfa_reg == stack_pointer_rtx);
11054 ix86_emit_restore_regs_using_pop ();
11057 /* If we used a frame pointer and haven't already got rid of it,
11058 then do so now. */
11059 if (m->fs.fp_valid)
11061 /* If the stack pointer is valid and pointing at the frame
11062 pointer store address, then we only need a pop. */
11063 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11064 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11065 /* Leave results in shorter dependency chains on CPUs that are
11066 able to grok it fast. */
11067 else if (TARGET_USE_LEAVE
11068 || optimize_function_for_size_p (cfun)
11069 || !cfun->machine->use_fast_prologue_epilogue)
11070 ix86_emit_leave ();
11071 else
11073 pro_epilogue_adjust_stack (stack_pointer_rtx,
11074 hard_frame_pointer_rtx,
11075 const0_rtx, style, !using_drap);
11076 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11080 if (using_drap)
11082 int param_ptr_offset = UNITS_PER_WORD;
11083 rtx insn;
11085 gcc_assert (stack_realign_drap);
11087 if (ix86_static_chain_on_stack)
11088 param_ptr_offset += UNITS_PER_WORD;
11089 if (!call_used_regs[REGNO (crtl->drap_reg)])
11090 param_ptr_offset += UNITS_PER_WORD;
11092 insn = emit_insn (gen_rtx_SET
11093 (VOIDmode, stack_pointer_rtx,
11094 gen_rtx_PLUS (Pmode,
11095 crtl->drap_reg,
11096 GEN_INT (-param_ptr_offset))));
11097 m->fs.cfa_reg = stack_pointer_rtx;
11098 m->fs.cfa_offset = param_ptr_offset;
11099 m->fs.sp_offset = param_ptr_offset;
11100 m->fs.realigned = false;
11102 add_reg_note (insn, REG_CFA_DEF_CFA,
11103 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11104 GEN_INT (param_ptr_offset)));
11105 RTX_FRAME_RELATED_P (insn) = 1;
11107 if (!call_used_regs[REGNO (crtl->drap_reg)])
11108 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11111 /* At this point the stack pointer must be valid, and we must have
11112 restored all of the registers. We may not have deallocated the
11113 entire stack frame. We've delayed this until now because it may
11114 be possible to merge the local stack deallocation with the
11115 deallocation forced by ix86_static_chain_on_stack. */
11116 gcc_assert (m->fs.sp_valid);
11117 gcc_assert (!m->fs.fp_valid);
11118 gcc_assert (!m->fs.realigned);
11119 if (m->fs.sp_offset != UNITS_PER_WORD)
11121 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11122 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11123 style, true);
11125 else
11126 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11128 /* Sibcall epilogues don't want a return instruction. */
11129 if (style == 0)
11131 m->fs = frame_state_save;
11132 return;
11135 if (crtl->args.pops_args && crtl->args.size)
11137 rtx popc = GEN_INT (crtl->args.pops_args);
11139 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11140 address, do an explicit add, and jump indirectly to the caller. */
11142 if (crtl->args.pops_args >= 65536)
11144 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11145 rtx insn;
11147 /* There is no "pascal" calling convention in any 64bit ABI. */
11148 gcc_assert (!TARGET_64BIT);
11150 insn = emit_insn (gen_pop (ecx));
11151 m->fs.cfa_offset -= UNITS_PER_WORD;
11152 m->fs.sp_offset -= UNITS_PER_WORD;
11154 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11155 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11156 add_reg_note (insn, REG_CFA_REGISTER,
11157 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11158 RTX_FRAME_RELATED_P (insn) = 1;
11160 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11161 popc, -1, true);
11162 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11164 else
11165 emit_jump_insn (gen_simple_return_pop_internal (popc));
11167 else
11168 emit_jump_insn (gen_simple_return_internal ());
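/* Illustrative sketch of the three returns above (N is
   crtl->args.pops_args; the first form is 32-bit only, since no
   64-bit ABI pops arguments):

       popl  %ecx          # N >= 64K: return address -> %ecx
       addl  $N, %esp      #           pop the arguments explicitly
       jmp   *%ecx         #           return to the caller

       ret   $N            # N < 64K

       ret                 # no argument popping  */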
11170 /* Restore the state back to the state from the prologue,
11171 so that it's correct for the next epilogue. */
11172 m->fs = frame_state_save;
11175 /* Reset from the function's potential modifications. */
11177 static void
11178 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11179 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11181 if (pic_offset_table_rtx)
11182 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11183 #if TARGET_MACHO
11184 /* Mach-O doesn't support labels at the end of objects, so if
11185 it looks like we might want one, insert a NOP. */
11187 rtx insn = get_last_insn ();
11188 rtx deleted_debug_label = NULL_RTX;
11189 while (insn
11190 && NOTE_P (insn)
11191 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11193 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11194 notes only, instead set their CODE_LABEL_NUMBER to -1,
11195 otherwise there would be code generation differences
11196 in between -g and -g0. */
11197 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11198 deleted_debug_label = insn;
11199 insn = PREV_INSN (insn);
11201 if (insn
11202 && (LABEL_P (insn)
11203 || (NOTE_P (insn)
11204 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11205 fputs ("\tnop\n", file);
11206 else if (deleted_debug_label)
11207 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11208 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11209 CODE_LABEL_NUMBER (insn) = -1;
11211 #endif
11215 /* Return a scratch register to use in the split stack prologue. The
11216 split stack prologue is used for -fsplit-stack. These are the first
11217 instructions in the function, even before the regular prologue.
11218 The scratch register can be any caller-saved register which is not
11219 used for parameters or for the static chain. */
11221 static unsigned int
11222 split_stack_prologue_scratch_regno (void)
11224 if (TARGET_64BIT)
11225 return R11_REG;
11226 else
11228 bool is_fastcall, is_thiscall;
11229 int regparm;
11231 is_fastcall = (lookup_attribute ("fastcall",
11232 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11233 != NULL);
11234 is_thiscall = (lookup_attribute ("thiscall",
11235 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11236 != NULL);
11237 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11239 if (is_fastcall)
11241 if (DECL_STATIC_CHAIN (cfun->decl))
11243 sorry ("-fsplit-stack does not support fastcall with "
11244 "nested function");
11245 return INVALID_REGNUM;
11247 return AX_REG;
11249 else if (is_thiscall)
11251 if (!DECL_STATIC_CHAIN (cfun->decl))
11252 return DX_REG;
11253 return AX_REG;
11255 else if (regparm < 3)
11257 if (!DECL_STATIC_CHAIN (cfun->decl))
11258 return CX_REG;
11259 else
11261 if (regparm >= 2)
11263 sorry ("-fsplit-stack does not support 2 register "
11264 " parameters for a nested function");
11265 return INVALID_REGNUM;
11267 return DX_REG;
11270 else
11272 /* FIXME: We could make this work by pushing a register
11273 around the addition and comparison. */
11274 sorry ("-fsplit-stack does not support 3 register parameters");
11275 return INVALID_REGNUM;
11280 /* A SYMBOL_REF for the function which allocates new stack space for
11281 -fsplit-stack. */
11283 static GTY(()) rtx split_stack_fn;
11285 /* A SYMBOL_REF for the more-stack function when using the large
11286 model. */
11288 static GTY(()) rtx split_stack_fn_large;
11290 /* Handle -fsplit-stack. These are the first instructions in the
11291 function, even before the regular prologue. */
11293 void
11294 ix86_expand_split_stack_prologue (void)
11296 struct ix86_frame frame;
11297 HOST_WIDE_INT allocate;
11298 unsigned HOST_WIDE_INT args_size;
11299 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11300 rtx scratch_reg = NULL_RTX;
11301 rtx varargs_label = NULL_RTX;
11302 rtx fn;
11304 gcc_assert (flag_split_stack && reload_completed);
11306 ix86_finalize_stack_realign_flags ();
11307 ix86_compute_frame_layout (&frame);
11308 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11310 /* This is the label we will branch to if we have enough stack
11311 space. We expect the basic block reordering pass to reverse this
11312 branch if optimizing, so that we branch in the unlikely case. */
11313 label = gen_label_rtx ();
11315 /* We need to compare the stack pointer minus the frame size with
11316 the stack boundary in the TCB. The stack boundary always gives
11317 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11318 can compare directly. Otherwise we need to do an addition. */
11320 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11321 UNSPEC_STACK_CHECK);
11322 limit = gen_rtx_CONST (Pmode, limit);
11323 limit = gen_rtx_MEM (Pmode, limit);
11324 if (allocate < SPLIT_STACK_AVAILABLE)
11325 current = stack_pointer_rtx;
11326 else
11328 unsigned int scratch_regno;
11329 rtx offset;
11331 /* We need a scratch register to hold the stack pointer minus
11332 the required frame size. Since this is the very start of the
11333 function, the scratch register can be any caller-saved
11334 register which is not used for parameters. */
11335 offset = GEN_INT (- allocate);
11336 scratch_regno = split_stack_prologue_scratch_regno ();
11337 if (scratch_regno == INVALID_REGNUM)
11338 return;
11339 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11340 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11342 /* We don't use ix86_gen_add3 in this case because it will
11343 want to split to lea, but when not optimizing the insn
11344 will not be split after this point. */
11345 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11346 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11347 offset)));
11349 else
11351 emit_move_insn (scratch_reg, offset);
11352 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11353 stack_pointer_rtx));
11355 current = scratch_reg;
11358 ix86_expand_branch (GEU, current, limit, label);
11359 jump_insn = get_last_insn ();
11360 JUMP_LABEL (jump_insn) = label;
11362 /* Mark the jump as very likely to be taken. */
11363 add_reg_note (jump_insn, REG_BR_PROB,
11364 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
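/* Sketch of the generated check on 64-bit targets (register and TCB
   slot are illustrative; the actual limit is the UNSPEC_STACK_CHECK
   memory reference built above):

       leaq  -FRAME(%rsp), %r11     # omitted when FRAME is small
       cmpq  %fs:BOUNDARY, %r11
       jae   .Lhave_enough_stack

   where BOUNDARY is the target-defined split-stack boundary slot in
   the thread control block.  */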
11366 if (split_stack_fn == NULL_RTX)
11367 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11368 fn = split_stack_fn;
11370 /* Get more stack space. We pass in the desired stack space and the
11371 size of the arguments to copy to the new stack. In 32-bit mode
11372 we push the parameters; __morestack will return on a new stack
11373 anyhow. In 64-bit mode we pass the parameters in r10 and
11374 r11. */
11375 allocate_rtx = GEN_INT (allocate);
11376 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11377 call_fusage = NULL_RTX;
11378 if (TARGET_64BIT)
11380 rtx reg10, reg11;
11382 reg10 = gen_rtx_REG (Pmode, R10_REG);
11383 reg11 = gen_rtx_REG (Pmode, R11_REG);
11385 /* If this function uses a static chain, it will be in %r10.
11386 Preserve it across the call to __morestack. */
11387 if (DECL_STATIC_CHAIN (cfun->decl))
11389 rtx rax;
11391 rax = gen_rtx_REG (word_mode, AX_REG);
11392 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11393 use_reg (&call_fusage, rax);
11396 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11398 HOST_WIDE_INT argval;
11400 gcc_assert (Pmode == DImode);
11401 /* When using the large model we need to load the address
11402 into a register, and we've run out of registers. So we
11403 switch to a different calling convention, and we call a
11404 different function: __morestack_large_model. We pass the
11405 argument size in the upper 32 bits of r10 and pass the
11406 frame size in the lower 32 bits. */
11407 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11408 gcc_assert ((args_size & 0xffffffff) == args_size);
11410 if (split_stack_fn_large == NULL_RTX)
11411 split_stack_fn_large =
11412 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11414 if (ix86_cmodel == CM_LARGE_PIC)
11416 rtx label, x;
11418 label = gen_label_rtx ();
11419 emit_label (label);
11420 LABEL_PRESERVE_P (label) = 1;
11421 emit_insn (gen_set_rip_rex64 (reg10, label));
11422 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11423 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11424 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11425 UNSPEC_GOT);
11426 x = gen_rtx_CONST (Pmode, x);
11427 emit_move_insn (reg11, x);
11428 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11429 x = gen_const_mem (Pmode, x);
11430 emit_move_insn (reg11, x);
11432 else
11433 emit_move_insn (reg11, split_stack_fn_large);
11435 fn = reg11;
11437 argval = ((args_size << 16) << 16) + allocate;
11438 emit_move_insn (reg10, GEN_INT (argval));
11440 else
11442 emit_move_insn (reg10, allocate_rtx);
11443 emit_move_insn (reg11, GEN_INT (args_size));
11444 use_reg (&call_fusage, reg11);
11447 use_reg (&call_fusage, reg10);
11449 else
11451 emit_insn (gen_push (GEN_INT (args_size)));
11452 emit_insn (gen_push (allocate_rtx));
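/* Worked example (purely illustrative): with 32 bytes of stack
   arguments and a 0x1000-byte frame, the large-model path above loads
   %r10 with 0x0000002000001000 (argument size in the upper 32 bits,
   frame size in the lower 32 bits); the plain 64-bit path passes the
   two values in %r10 and %r11, and the 32-bit path simply pushes
   them.  */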
11454 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11455 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11456 NULL_RTX, false);
11457 add_function_usage_to (call_insn, call_fusage);
11459 /* In order to make call/return prediction work right, we now need
11460 to execute a return instruction. See
11461 libgcc/config/i386/morestack.S for the details on how this works.
11463 For flow purposes gcc must not see this as a return
11464 instruction--we need control flow to continue at the subsequent
11465 label. Therefore, we use an unspec. */
11466 gcc_assert (crtl->args.pops_args < 65536);
11467 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11469 /* If we are in 64-bit mode and this function uses a static chain,
11470 we saved %r10 in %rax before calling __morestack.
11471 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11472 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11473 gen_rtx_REG (word_mode, AX_REG));
11475 /* If this function calls va_start, we need to store a pointer to
11476 the arguments on the old stack, because they may not have been
11477 all copied to the new stack. At this point the old stack can be
11478 found at the frame pointer value used by __morestack, because
11479 __morestack has set that up before calling back to us. Here we
11480 store that pointer in a scratch register, and in
11481 ix86_expand_prologue we store the scratch register in a stack
11482 slot. */
11483 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11485 unsigned int scratch_regno;
11486 rtx frame_reg;
11487 int words;
11489 scratch_regno = split_stack_prologue_scratch_regno ();
11490 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11491 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11493 /* 64-bit:
11494 fp -> old fp value
11495 return address within this function
11496 return address of caller of this function
11497 stack arguments
11498 So we add three words to get to the stack arguments.
11500 32-bit:
11501 fp -> old fp value
11502 return address within this function
11503 first argument to __morestack
11504 second argument to __morestack
11505 return address of caller of this function
11506 stack arguments
11507 So we add five words to get to the stack arguments.
11509 words = TARGET_64BIT ? 3 : 5;
11510 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11511 gen_rtx_PLUS (Pmode, frame_reg,
11512 GEN_INT (words * UNITS_PER_WORD))));
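/* E.g. on 64-bit targets (UNITS_PER_WORD == 8) the insn above is
   effectively "leaq 24(%rbp), %scratch"; on 32-bit targets it is
   "leal 20(%ebp), %scratch".  */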
11514 varargs_label = gen_label_rtx ();
11515 emit_jump_insn (gen_jump (varargs_label));
11516 JUMP_LABEL (get_last_insn ()) = varargs_label;
11518 emit_barrier ();
11521 emit_label (label);
11522 LABEL_NUSES (label) = 1;
11524 /* If this function calls va_start, we now have to set the scratch
11525 register for the case where we do not call __morestack. In this
11526 case we need to set it based on the stack pointer. */
11527 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11529 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11530 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11531 GEN_INT (UNITS_PER_WORD))));
11533 emit_label (varargs_label);
11534 LABEL_NUSES (varargs_label) = 1;
11538 /* We may have to tell the dataflow pass that the split stack prologue
11539 is initializing a scratch register. */
11541 static void
11542 ix86_live_on_entry (bitmap regs)
11544 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11546 gcc_assert (flag_split_stack);
11547 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11551 /* Determine if OP is a suitable SUBREG RTX for an address. */
11553 static bool
11554 ix86_address_subreg_operand (rtx op)
11556 enum machine_mode mode;
11558 if (!REG_P (op))
11559 return false;
11561 mode = GET_MODE (op);
11563 if (GET_MODE_CLASS (mode) != MODE_INT)
11564 return false;
11566 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11567 failures when the register is one word out of a two word structure. */
11568 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11569 return false;
11571 /* Allow only SUBREGs of non-eliminable hard registers. */
11572 return register_no_elim_operand (op, mode);
11575 /* Extract the parts of an RTL expression that is a valid memory address
11576 for an instruction. Return 0 if the structure of the address is
11577 grossly off. Return -1 if the address contains ASHIFT, so it is not
11578 strictly valid, but still used for computing the length of an lea instruction. */
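/* Worked example (illustrative): for the 32-bit address
   12(%ebx,%ecx,4), written as
       (plus (plus (mult (reg %ecx) (const_int 4)) (reg %ebx))
             (const_int 12)),
   the decomposition yields out->base = %ebx, out->index = %ecx,
   out->scale = 4, out->disp = (const_int 12), out->seg = SEG_DEFAULT,
   and the return value is 1.  */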
11581 ix86_decompose_address (rtx addr, struct ix86_address *out)
11583 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11584 rtx base_reg, index_reg;
11585 HOST_WIDE_INT scale = 1;
11586 rtx scale_rtx = NULL_RTX;
11587 rtx tmp;
11588 int retval = 1;
11589 enum ix86_address_seg seg = SEG_DEFAULT;
11591 /* Allow zero-extended SImode addresses;
11592 they will be emitted with the addr32 prefix. */
11593 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11595 if (GET_CODE (addr) == ZERO_EXTEND
11596 && GET_MODE (XEXP (addr, 0)) == SImode)
11598 addr = XEXP (addr, 0);
11599 if (CONST_INT_P (addr))
11600 return 0;
11602 else if (GET_CODE (addr) == AND
11603 && const_32bit_mask (XEXP (addr, 1), DImode))
11605 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11606 if (addr == NULL_RTX)
11607 return 0;
11609 if (CONST_INT_P (addr))
11610 return 0;
11614 /* Allow SImode subregs of DImode addresses;
11615 they will be emitted with the addr32 prefix. */
11616 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11618 if (GET_CODE (addr) == SUBREG
11619 && GET_MODE (SUBREG_REG (addr)) == DImode)
11621 addr = SUBREG_REG (addr);
11622 if (CONST_INT_P (addr))
11623 return 0;
11627 if (REG_P (addr))
11628 base = addr;
11629 else if (GET_CODE (addr) == SUBREG)
11631 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11632 base = addr;
11633 else
11634 return 0;
11636 else if (GET_CODE (addr) == PLUS)
11638 rtx addends[4], op;
11639 int n = 0, i;
11641 op = addr;
11644 if (n >= 4)
11645 return 0;
11646 addends[n++] = XEXP (op, 1);
11647 op = XEXP (op, 0);
11649 while (GET_CODE (op) == PLUS);
11650 if (n >= 4)
11651 return 0;
11652 addends[n] = op;
11654 for (i = n; i >= 0; --i)
11656 op = addends[i];
11657 switch (GET_CODE (op))
11659 case MULT:
11660 if (index)
11661 return 0;
11662 index = XEXP (op, 0);
11663 scale_rtx = XEXP (op, 1);
11664 break;
11666 case ASHIFT:
11667 if (index)
11668 return 0;
11669 index = XEXP (op, 0);
11670 tmp = XEXP (op, 1);
11671 if (!CONST_INT_P (tmp))
11672 return 0;
11673 scale = INTVAL (tmp);
11674 if ((unsigned HOST_WIDE_INT) scale > 3)
11675 return 0;
11676 scale = 1 << scale;
11677 break;
11679 case ZERO_EXTEND:
11680 op = XEXP (op, 0);
11681 if (GET_CODE (op) != UNSPEC)
11682 return 0;
11683 /* FALLTHRU */
11685 case UNSPEC:
11686 if (XINT (op, 1) == UNSPEC_TP
11687 && TARGET_TLS_DIRECT_SEG_REFS
11688 && seg == SEG_DEFAULT)
11689 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11690 else
11691 return 0;
11692 break;
11694 case SUBREG:
11695 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11696 return 0;
11697 /* FALLTHRU */
11699 case REG:
11700 if (!base)
11701 base = op;
11702 else if (!index)
11703 index = op;
11704 else
11705 return 0;
11706 break;
11708 case CONST:
11709 case CONST_INT:
11710 case SYMBOL_REF:
11711 case LABEL_REF:
11712 if (disp)
11713 return 0;
11714 disp = op;
11715 break;
11717 default:
11718 return 0;
11722 else if (GET_CODE (addr) == MULT)
11724 index = XEXP (addr, 0); /* index*scale */
11725 scale_rtx = XEXP (addr, 1);
11727 else if (GET_CODE (addr) == ASHIFT)
11729 /* We're called for lea too, which implements ashift on occasion. */
11730 index = XEXP (addr, 0);
11731 tmp = XEXP (addr, 1);
11732 if (!CONST_INT_P (tmp))
11733 return 0;
11734 scale = INTVAL (tmp);
11735 if ((unsigned HOST_WIDE_INT) scale > 3)
11736 return 0;
11737 scale = 1 << scale;
11738 retval = -1;
11740 else if (CONST_INT_P (addr))
11742 if (!x86_64_immediate_operand (addr, VOIDmode))
11743 return 0;
11745 /* Constant addresses are sign extended to 64bit; we have to
11746 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11747 if (TARGET_X32
11748 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11749 return 0;
11751 disp = addr;
11753 else
11754 disp = addr; /* displacement */
11756 if (index)
11758 if (REG_P (index))
11760 else if (GET_CODE (index) == SUBREG
11761 && ix86_address_subreg_operand (SUBREG_REG (index)))
11763 else
11764 return 0;
11767 /* Address override works only on the (%reg) part of %fs:(%reg). */
11768 if (seg != SEG_DEFAULT
11769 && ((base && GET_MODE (base) != word_mode)
11770 || (index && GET_MODE (index) != word_mode)))
11771 return 0;
11773 /* Extract the integral value of scale. */
11774 if (scale_rtx)
11776 if (!CONST_INT_P (scale_rtx))
11777 return 0;
11778 scale = INTVAL (scale_rtx);
11781 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11782 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11784 /* Avoid useless 0 displacement. */
11785 if (disp == const0_rtx && (base || index))
11786 disp = NULL_RTX;
11788 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11789 if (base_reg && index_reg && scale == 1
11790 && (index_reg == arg_pointer_rtx
11791 || index_reg == frame_pointer_rtx
11792 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11794 rtx tmp;
11795 tmp = base, base = index, index = tmp;
11796 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11799 /* Special case: %ebp cannot be encoded as a base without a displacement.
11800 Similarly %r13. */
11801 if (!disp
11802 && base_reg
11803 && (base_reg == hard_frame_pointer_rtx
11804 || base_reg == frame_pointer_rtx
11805 || base_reg == arg_pointer_rtx
11806 || (REG_P (base_reg)
11807 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11808 || REGNO (base_reg) == R13_REG))))
11809 disp = const0_rtx;
11811 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11812 Avoid this by transforming to [%esi+0].
11813 Reload calls address legitimization without cfun defined, so we need
11814 to test cfun for being non-NULL. */
11815 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11816 && base_reg && !index_reg && !disp
11817 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11818 disp = const0_rtx;
11820 /* Special case: encode reg+reg instead of reg*2. */
11821 if (!base && index && scale == 2)
11822 base = index, base_reg = index_reg, scale = 1;
11824 /* Special case: scaling cannot be encoded without base or displacement. */
11825 if (!base && !disp && index && scale != 1)
11826 disp = const0_rtx;
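/* Illustration of the special cases above: a bare (%ebp) becomes
   0(%ebp), since %ebp as a base cannot be encoded without a
   displacement byte; and 2*%eax with no base becomes %eax+%eax
   (base plus index, scale 1), which avoids the 32-bit displacement
   that a scaled index without a base would require.  */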
11828 out->base = base;
11829 out->index = index;
11830 out->disp = disp;
11831 out->scale = scale;
11832 out->seg = seg;
11834 return retval;
11837 /* Return cost of the memory address x.
11838 For i386, it is better to use a complex address than let gcc copy
11839 the address into a reg and make a new pseudo. But not if the address
11840 requires two regs - that would mean more pseudos with longer
11841 lifetimes. */
11842 static int
11843 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11844 addr_space_t as ATTRIBUTE_UNUSED,
11845 bool speed ATTRIBUTE_UNUSED)
11847 struct ix86_address parts;
11848 int cost = 1;
11849 int ok = ix86_decompose_address (x, &parts);
11851 gcc_assert (ok);
11853 if (parts.base && GET_CODE (parts.base) == SUBREG)
11854 parts.base = SUBREG_REG (parts.base);
11855 if (parts.index && GET_CODE (parts.index) == SUBREG)
11856 parts.index = SUBREG_REG (parts.index);
11858 /* Attempt to minimize number of registers in the address. */
11859 if ((parts.base
11860 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11861 || (parts.index
11862 && (!REG_P (parts.index)
11863 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11864 cost++;
11866 if (parts.base
11867 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11868 && parts.index
11869 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11870 && parts.base != parts.index)
11871 cost++;
11873 /* AMD-K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11874 since its predecode logic can't detect the length of instructions
11875 and it degenerates to vector decoding. Increase the cost of such
11876 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11877 to split such addresses or even refuse them altogether.
11879 The following addressing modes are affected:
11880 [base+scale*index]
11881 [scale*index+disp]
11882 [base+index]
11884 The first and last case may be avoidable by explicitly coding the zero into
11885 the memory address, but I don't have an AMD-K6 machine handy to check this
11886 theory. */
11888 if (TARGET_K6
11889 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11890 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11891 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11892 cost += 10;
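/* Illustration: on K6, "movl (%ebx,%ecx,4), %eax" (base plus scaled
   index, no displacement) pays the penalty above, while
   "movl 4(%ebx,%ecx,4), %eax" does not, because the explicit
   displacement moves the ModR/M byte out of the 00_xxx_100b form.  */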
11894 return cost;
11897 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11898 this is used to form addresses to local data when -fPIC is in
11899 use. */
11901 static bool
11902 darwin_local_data_pic (rtx disp)
11904 return (GET_CODE (disp) == UNSPEC
11905 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11908 /* Determine if a given RTX is a valid constant. We already know this
11909 satisfies CONSTANT_P. */
11911 static bool
11912 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11914 switch (GET_CODE (x))
11916 case CONST:
11917 x = XEXP (x, 0);
11919 if (GET_CODE (x) == PLUS)
11921 if (!CONST_INT_P (XEXP (x, 1)))
11922 return false;
11923 x = XEXP (x, 0);
11926 if (TARGET_MACHO && darwin_local_data_pic (x))
11927 return true;
11929 /* Only some unspecs are valid as "constants". */
11930 if (GET_CODE (x) == UNSPEC)
11931 switch (XINT (x, 1))
11933 case UNSPEC_GOT:
11934 case UNSPEC_GOTOFF:
11935 case UNSPEC_PLTOFF:
11936 return TARGET_64BIT;
11937 case UNSPEC_TPOFF:
11938 case UNSPEC_NTPOFF:
11939 x = XVECEXP (x, 0, 0);
11940 return (GET_CODE (x) == SYMBOL_REF
11941 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11942 case UNSPEC_DTPOFF:
11943 x = XVECEXP (x, 0, 0);
11944 return (GET_CODE (x) == SYMBOL_REF
11945 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11946 default:
11947 return false;
11950 /* We must have drilled down to a symbol. */
11951 if (GET_CODE (x) == LABEL_REF)
11952 return true;
11953 if (GET_CODE (x) != SYMBOL_REF)
11954 return false;
11955 /* FALLTHRU */
11957 case SYMBOL_REF:
11958 /* TLS symbols are never valid. */
11959 if (SYMBOL_REF_TLS_MODEL (x))
11960 return false;
11962 /* DLLIMPORT symbols are never valid. */
11963 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11964 && SYMBOL_REF_DLLIMPORT_P (x))
11965 return false;
11967 #if TARGET_MACHO
11968 /* mdynamic-no-pic */
11969 if (MACHO_DYNAMIC_NO_PIC_P)
11970 return machopic_symbol_defined_p (x);
11971 #endif
11972 break;
11974 case CONST_DOUBLE:
11975 if (GET_MODE (x) == TImode
11976 && x != CONST0_RTX (TImode)
11977 && !TARGET_64BIT)
11978 return false;
11979 break;
11981 case CONST_VECTOR:
11982 if (!standard_sse_constant_p (x))
11983 return false;
11985 default:
11986 break;
11989 /* Otherwise we handle everything else in the move patterns. */
11990 return true;
11993 /* Determine if it's legal to put X into the constant pool. This
11994 is not possible for the address of thread-local symbols, which
11995 is checked above. */
11997 static bool
11998 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12000 /* We can always put integral constants and vectors in memory. */
12001 switch (GET_CODE (x))
12003 case CONST_INT:
12004 case CONST_DOUBLE:
12005 case CONST_VECTOR:
12006 return false;
12008 default:
12009 break;
12011 return !ix86_legitimate_constant_p (mode, x);
12015 /* Nonzero if the constant value X is a legitimate general operand
12016 when generating PIC code. It is given that flag_pic is on and
12017 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12019 bool
12020 legitimate_pic_operand_p (rtx x)
12022 rtx inner;
12024 switch (GET_CODE (x))
12026 case CONST:
12027 inner = XEXP (x, 0);
12028 if (GET_CODE (inner) == PLUS
12029 && CONST_INT_P (XEXP (inner, 1)))
12030 inner = XEXP (inner, 0);
12032 /* Only some unspecs are valid as "constants". */
12033 if (GET_CODE (inner) == UNSPEC)
12034 switch (XINT (inner, 1))
12036 case UNSPEC_GOT:
12037 case UNSPEC_GOTOFF:
12038 case UNSPEC_PLTOFF:
12039 return TARGET_64BIT;
12040 case UNSPEC_TPOFF:
12041 x = XVECEXP (inner, 0, 0);
12042 return (GET_CODE (x) == SYMBOL_REF
12043 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12044 case UNSPEC_MACHOPIC_OFFSET:
12045 return legitimate_pic_address_disp_p (x);
12046 default:
12047 return false;
12049 /* FALLTHRU */
12051 case SYMBOL_REF:
12052 case LABEL_REF:
12053 return legitimate_pic_address_disp_p (x);
12055 default:
12056 return true;
12060 /* Determine if a given CONST RTX is a valid memory displacement
12061 in PIC mode. */
12063 bool
12064 legitimate_pic_address_disp_p (rtx disp)
12066 bool saw_plus;
12068 /* In 64bit mode we can allow direct addresses of symbols and labels
12069 when they are not dynamic symbols. */
12070 if (TARGET_64BIT)
12072 rtx op0 = disp, op1;
12074 switch (GET_CODE (disp))
12076 case LABEL_REF:
12077 return true;
12079 case CONST:
12080 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12081 break;
12082 op0 = XEXP (XEXP (disp, 0), 0);
12083 op1 = XEXP (XEXP (disp, 0), 1);
12084 if (!CONST_INT_P (op1)
12085 || INTVAL (op1) >= 16*1024*1024
12086 || INTVAL (op1) < -16*1024*1024)
12087 break;
12088 if (GET_CODE (op0) == LABEL_REF)
12089 return true;
12090 if (GET_CODE (op0) == CONST
12091 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12092 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12093 return true;
12094 if (GET_CODE (op0) == UNSPEC
12095 && XINT (op0, 1) == UNSPEC_PCREL)
12096 return true;
12097 if (GET_CODE (op0) != SYMBOL_REF)
12098 break;
12099 /* FALLTHRU */
12101 case SYMBOL_REF:
12102 /* TLS references should always be enclosed in UNSPEC. */
12103 if (SYMBOL_REF_TLS_MODEL (op0))
12104 return false;
12105 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12106 && ix86_cmodel != CM_LARGE_PIC)
12107 return true;
12108 break;
12110 default:
12111 break;
12114 if (GET_CODE (disp) != CONST)
12115 return false;
12116 disp = XEXP (disp, 0);
12118 if (TARGET_64BIT)
12120 /* It is not safe to allow PLUS expressions. This limits the allowed distance
12121 of GOT table references; we should not need these anyway. */
12122 if (GET_CODE (disp) != UNSPEC
12123 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12124 && XINT (disp, 1) != UNSPEC_GOTOFF
12125 && XINT (disp, 1) != UNSPEC_PCREL
12126 && XINT (disp, 1) != UNSPEC_PLTOFF))
12127 return false;
12129 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12130 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12131 return false;
12132 return true;
12135 saw_plus = false;
12136 if (GET_CODE (disp) == PLUS)
12138 if (!CONST_INT_P (XEXP (disp, 1)))
12139 return false;
12140 disp = XEXP (disp, 0);
12141 saw_plus = true;
12144 if (TARGET_MACHO && darwin_local_data_pic (disp))
12145 return true;
12147 if (GET_CODE (disp) != UNSPEC)
12148 return false;
12150 switch (XINT (disp, 1))
12152 case UNSPEC_GOT:
12153 if (saw_plus)
12154 return false;
12155 /* We need to check for both symbols and labels because VxWorks loads
12156 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12157 details. */
12158 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12159 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12160 case UNSPEC_GOTOFF:
12161 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12162 While the ABI also specifies a 32bit relocation, we don't produce it in
12163 the small PIC model at all. */
12164 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12165 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12166 && !TARGET_64BIT)
12167 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12168 return false;
12169 case UNSPEC_GOTTPOFF:
12170 case UNSPEC_GOTNTPOFF:
12171 case UNSPEC_INDNTPOFF:
12172 if (saw_plus)
12173 return false;
12174 disp = XVECEXP (disp, 0, 0);
12175 return (GET_CODE (disp) == SYMBOL_REF
12176 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12177 case UNSPEC_NTPOFF:
12178 disp = XVECEXP (disp, 0, 0);
12179 return (GET_CODE (disp) == SYMBOL_REF
12180 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12181 case UNSPEC_DTPOFF:
12182 disp = XVECEXP (disp, 0, 0);
12183 return (GET_CODE (disp) == SYMBOL_REF
12184 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12187 return false;
12190 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12191 replace the input X, or the original X if no replacement is called for.
12192 The output parameter *WIN is 1 if the calling macro should goto WIN,
12193 0 if it should not. */
12195 bool
12196 ix86_legitimize_reload_address (rtx x,
12197 enum machine_mode mode ATTRIBUTE_UNUSED,
12198 int opnum, int type,
12199 int ind_levels ATTRIBUTE_UNUSED)
12201 /* Reload can generate:
12203 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12204 (reg:DI 97))
12205 (reg:DI 2 cx))
12207 This RTX is rejected from ix86_legitimate_address_p due to
12208 non-strictness of base register 97. Following this rejection,
12209 reload pushes all three components into separate registers,
12210 creating an invalid memory address RTX.
12212 The following code reloads only the invalid part of the
12213 memory address RTX. */
12215 if (GET_CODE (x) == PLUS
12216 && REG_P (XEXP (x, 1))
12217 && GET_CODE (XEXP (x, 0)) == PLUS
12218 && REG_P (XEXP (XEXP (x, 0), 1)))
12220 rtx base, index;
12221 bool something_reloaded = false;
12223 base = XEXP (XEXP (x, 0), 1);
12224 if (!REG_OK_FOR_BASE_STRICT_P (base))
12226 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12227 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12228 opnum, (enum reload_type) type);
12229 something_reloaded = true;
12232 index = XEXP (x, 1);
12233 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12235 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12236 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12237 opnum, (enum reload_type) type);
12238 something_reloaded = true;
12241 gcc_assert (something_reloaded);
12242 return true;
12245 return false;
12248 /* Recognizes RTL expressions that are valid memory addresses for an
12249 instruction. The MODE argument is the machine mode for the MEM
12250 expression that wants to use this address.
12252 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12253 convert common non-canonical forms to canonical form so that they will
12254 be recognized. */
12256 static bool
12257 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12258 rtx addr, bool strict)
12260 struct ix86_address parts;
12261 rtx base, index, disp;
12262 HOST_WIDE_INT scale;
12264 if (ix86_decompose_address (addr, &parts) <= 0)
12265 /* Decomposition failed. */
12266 return false;
12268 base = parts.base;
12269 index = parts.index;
12270 disp = parts.disp;
12271 scale = parts.scale;
12273 /* Validate base register. */
12274 if (base)
12276 rtx reg;
12278 if (REG_P (base))
12279 reg = base;
12280 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12281 reg = SUBREG_REG (base);
12282 else
12283 /* Base is not a register. */
12284 return false;
12286 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12287 return false;
12289 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12290 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12291 /* Base is not valid. */
12292 return false;
12295 /* Validate index register. */
12296 if (index)
12298 rtx reg;
12300 if (REG_P (index))
12301 reg = index;
12302 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12303 reg = SUBREG_REG (index);
12304 else
12305 /* Index is not a register. */
12306 return false;
12308 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12309 return false;
12311 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12312 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12313 /* Index is not valid. */
12314 return false;
12317 /* Index and base should have the same mode. */
12318 if (base && index
12319 && GET_MODE (base) != GET_MODE (index))
12320 return false;
12322 /* Validate scale factor. */
12323 if (scale != 1)
12325 if (!index)
12326 /* Scale without index. */
12327 return false;
12329 if (scale != 2 && scale != 4 && scale != 8)
12330 /* Scale is not a valid multiplier. */
12331 return false;
12334 /* Validate displacement. */
12335 if (disp)
12337 if (GET_CODE (disp) == CONST
12338 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12339 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12340 switch (XINT (XEXP (disp, 0), 1))
12342 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12343 used. While the ABI also specifies 32bit relocations, we don't produce
12344 them at all and use IP-relative addressing instead. */
12345 case UNSPEC_GOT:
12346 case UNSPEC_GOTOFF:
12347 gcc_assert (flag_pic);
12348 if (!TARGET_64BIT)
12349 goto is_legitimate_pic;
12351 /* 64bit address unspec. */
12352 return false;
12354 case UNSPEC_GOTPCREL:
12355 case UNSPEC_PCREL:
12356 gcc_assert (flag_pic);
12357 goto is_legitimate_pic;
12359 case UNSPEC_GOTTPOFF:
12360 case UNSPEC_GOTNTPOFF:
12361 case UNSPEC_INDNTPOFF:
12362 case UNSPEC_NTPOFF:
12363 case UNSPEC_DTPOFF:
12364 break;
12366 case UNSPEC_STACK_CHECK:
12367 gcc_assert (flag_split_stack);
12368 break;
12370 default:
12371 /* Invalid address unspec. */
12372 return false;
12375 else if (SYMBOLIC_CONST (disp)
12376 && (flag_pic
12377 || (TARGET_MACHO
12378 #if TARGET_MACHO
12379 && MACHOPIC_INDIRECT
12380 && !machopic_operand_p (disp)
12381 #endif
12385 is_legitimate_pic:
12386 if (TARGET_64BIT && (index || base))
12388 /* foo@dtpoff(%rX) is ok. */
12389 if (GET_CODE (disp) != CONST
12390 || GET_CODE (XEXP (disp, 0)) != PLUS
12391 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12392 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12393 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12394 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12395 /* Non-constant pic memory reference. */
12396 return false;
12398 else if ((!TARGET_MACHO || flag_pic)
12399 && ! legitimate_pic_address_disp_p (disp))
12400 /* Displacement is an invalid pic construct. */
12401 return false;
12402 #if TARGET_MACHO
12403 else if (MACHO_DYNAMIC_NO_PIC_P
12404 && !ix86_legitimate_constant_p (Pmode, disp))
12405 /* displacement must be referenced via non_lazy_pointer */
12406 return false;
12407 #endif
12409 /* This code used to verify that a symbolic pic displacement
12410 includes the pic_offset_table_rtx register.
12412 While this is a good idea, unfortunately these constructs may
12413 be created by the "adds using lea" optimization for incorrect
12414 code like:
12416 int a;
12417 int foo(int i)
12419 return *(&a+i);
12422 This code is nonsensical, but results in addressing the
12423 GOT table with a pic_offset_table_rtx base. We can't
12424 just refuse it easily, since it gets matched by the
12425 "addsi3" pattern, which later gets split to lea when the
12426 output register differs from the input. While this
12427 could be handled by a separate addsi pattern for this case
12428 that never results in lea, disabling this test seems to be
12429 the easier and correct fix for the crash. */
12431 else if (GET_CODE (disp) != LABEL_REF
12432 && !CONST_INT_P (disp)
12433 && (GET_CODE (disp) != CONST
12434 || !ix86_legitimate_constant_p (Pmode, disp))
12435 && (GET_CODE (disp) != SYMBOL_REF
12436 || !ix86_legitimate_constant_p (Pmode, disp)))
12437 /* Displacement is not constant. */
12438 return false;
12439 else if (TARGET_64BIT
12440 && !x86_64_immediate_operand (disp, VOIDmode))
12441 /* Displacement is out of range. */
12442 return false;
12445 /* Everything looks valid. */
12446 return true;
12449 /* Determine if a given RTX is a valid constant address. */
12451 bool
12452 constant_address_p (rtx x)
12454 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12457 /* Return a unique alias set for the GOT. */
12459 static alias_set_type
12460 ix86_GOT_alias_set (void)
12462 static alias_set_type set = -1;
12463 if (set == -1)
12464 set = new_alias_set ();
12465 return set;
12468 /* Return a legitimate reference for ORIG (an address) using the
12469 register REG. If REG is 0, a new pseudo is generated.
12471 There are two types of references that must be handled:
12473 1. Global data references must load the address from the GOT, via
12474 the PIC reg. An insn is emitted to do this load, and the reg is
12475 returned.
12477 2. Static data references, constant pool addresses, and code labels
12478 compute the address as an offset from the GOT, whose base is in
12479 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12480 differentiate them from global data objects. The returned
12481 address is the PIC reg + an unspec constant.
12483 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12484 reg also appears in the address. */
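/* Illustrative forms of the two cases (32-bit ELF shown; the exact
   relocations are target-specific): a global data reference becomes a
   load through the GOT, e.g. "movl foo@GOT(%ebx), %reg", while static
   data is addressed relative to the GOT base, e.g.
   "leal bar@GOTOFF(%ebx), %reg".  On x86-64 the GOT load is
   RIP-relative: "movq foo@GOTPCREL(%rip), %reg".  */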
12486 static rtx
12487 legitimize_pic_address (rtx orig, rtx reg)
12489 rtx addr = orig;
12490 rtx new_rtx = orig;
12492 #if TARGET_MACHO
12493 if (TARGET_MACHO && !TARGET_64BIT)
12495 if (reg == 0)
12496 reg = gen_reg_rtx (Pmode);
12497 /* Use the generic Mach-O PIC machinery. */
12498 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12500 #endif
12502 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12503 new_rtx = addr;
12504 else if (TARGET_64BIT
12505 && ix86_cmodel != CM_SMALL_PIC
12506 && gotoff_operand (addr, Pmode))
12508 rtx tmpreg;
12509 /* This symbol may be referenced via a displacement from the PIC
12510 base address (@GOTOFF). */
12512 if (reload_in_progress)
12513 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12514 if (GET_CODE (addr) == CONST)
12515 addr = XEXP (addr, 0);
12516 if (GET_CODE (addr) == PLUS)
12518 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12519 UNSPEC_GOTOFF);
12520 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12522 else
12523 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12524 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12525 if (!reg)
12526 tmpreg = gen_reg_rtx (Pmode);
12527 else
12528 tmpreg = reg;
12529 emit_move_insn (tmpreg, new_rtx);
12531 if (reg != 0)
12533 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12534 tmpreg, 1, OPTAB_DIRECT);
12535 new_rtx = reg;
12537 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12539 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12541 /* This symbol may be referenced via a displacement from the PIC
12542 base address (@GOTOFF). */
12544 if (reload_in_progress)
12545 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12546 if (GET_CODE (addr) == CONST)
12547 addr = XEXP (addr, 0);
12548 if (GET_CODE (addr) == PLUS)
12550 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12551 UNSPEC_GOTOFF);
12552 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12554 else
12555 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12556 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12557 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12559 if (reg != 0)
12561 emit_move_insn (reg, new_rtx);
12562 new_rtx = reg;
12565 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12566 /* We can't use @GOTOFF for text labels on VxWorks;
12567 see gotoff_operand. */
12568 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12570 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12572 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12573 return legitimize_dllimport_symbol (addr, true);
12574 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12575 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12576 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12578 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12579 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12583 /* For x64 PE-COFF there is no GOT table, so we use the address
12584 directly.  */
12585 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12587 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12588 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12590 if (reg == 0)
12591 reg = gen_reg_rtx (Pmode);
12592 emit_move_insn (reg, new_rtx);
12593 new_rtx = reg;
12595 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12597 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12598 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12599 new_rtx = gen_const_mem (Pmode, new_rtx);
12600 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12602 if (reg == 0)
12603 reg = gen_reg_rtx (Pmode);
12604 /* Use gen_movsi directly, otherwise the address is loaded
12605 into a register for CSE.  We don't want to CSE these addresses;
12606 instead we CSE addresses loaded from the GOT table, so skip this.  */
12607 emit_insn (gen_movsi (reg, new_rtx));
12608 new_rtx = reg;
12610 else
12612 /* This symbol must be referenced via a load from the
12613 Global Offset Table (@GOT). */
12615 if (reload_in_progress)
12616 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12617 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12618 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12619 if (TARGET_64BIT)
12620 new_rtx = force_reg (Pmode, new_rtx);
12621 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12622 new_rtx = gen_const_mem (Pmode, new_rtx);
12623 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12625 if (reg == 0)
12626 reg = gen_reg_rtx (Pmode);
12627 emit_move_insn (reg, new_rtx);
12628 new_rtx = reg;
12631 else
12633 if (CONST_INT_P (addr)
12634 && !x86_64_immediate_operand (addr, VOIDmode))
12636 if (reg)
12638 emit_move_insn (reg, addr);
12639 new_rtx = reg;
12641 else
12642 new_rtx = force_reg (Pmode, addr);
12644 else if (GET_CODE (addr) == CONST)
12646 addr = XEXP (addr, 0);
12648 /* We must match stuff we generate before. Assume the only
12649 unspecs that can get here are ours. Not that we could do
12650 anything with them anyway.... */
12651 if (GET_CODE (addr) == UNSPEC
12652 || (GET_CODE (addr) == PLUS
12653 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12654 return orig;
12655 gcc_assert (GET_CODE (addr) == PLUS);
12657 if (GET_CODE (addr) == PLUS)
12659 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12661 /* Check first to see if this is a constant offset from a @GOTOFF
12662 symbol reference. */
12663 if (gotoff_operand (op0, Pmode)
12664 && CONST_INT_P (op1))
12666 if (!TARGET_64BIT)
12668 if (reload_in_progress)
12669 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12670 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12671 UNSPEC_GOTOFF);
12672 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12673 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12674 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12676 if (reg != 0)
12678 emit_move_insn (reg, new_rtx);
12679 new_rtx = reg;
12682 else
12684 if (INTVAL (op1) < -16*1024*1024
12685 || INTVAL (op1) >= 16*1024*1024)
12687 if (!x86_64_immediate_operand (op1, Pmode))
12688 op1 = force_reg (Pmode, op1);
12689 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12693 else
12695 rtx base = legitimize_pic_address (op0, reg);
12696 enum machine_mode mode = GET_MODE (base);
12697 new_rtx
12698 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12700 if (CONST_INT_P (new_rtx))
12702 if (INTVAL (new_rtx) < -16*1024*1024
12703 || INTVAL (new_rtx) >= 16*1024*1024)
12705 if (!x86_64_immediate_operand (new_rtx, mode))
12706 new_rtx = force_reg (mode, new_rtx);
12707 new_rtx
12708 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12710 else
12711 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12713 else
12715 if (GET_CODE (new_rtx) == PLUS
12716 && CONSTANT_P (XEXP (new_rtx, 1)))
12718 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12719 new_rtx = XEXP (new_rtx, 1);
12721 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12726 return new_rtx;
12729 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12731 static rtx
12732 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12734 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12736 if (GET_MODE (tp) != tp_mode)
12738 gcc_assert (GET_MODE (tp) == SImode);
12739 gcc_assert (tp_mode == DImode);
12741 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12744 if (to_reg)
12745 tp = copy_to_mode_reg (tp_mode, tp);
12747 return tp;
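/* Illustration: on GNU/Linux the thread pointer is the base of the %gs
   segment in 32-bit mode and of the %fs segment in 64-bit mode, so the
   UNSPEC_TP above typically materializes as

     movl %gs:0, %eax        (ia32)
     movq %fs:0, %rax        (x86-64)

   (see also the '@' operand code in ix86_print_operand).  */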
12750 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12752 static GTY(()) rtx ix86_tls_symbol;
12754 static rtx
12755 ix86_tls_get_addr (void)
12757 if (!ix86_tls_symbol)
12759 const char *sym
12760 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12761 ? "___tls_get_addr" : "__tls_get_addr");
12763 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12766 return ix86_tls_symbol;
12769 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12771 static GTY(()) rtx ix86_tls_module_base_symbol;
12774 ix86_tls_module_base (void)
12776 if (!ix86_tls_module_base_symbol)
12778 ix86_tls_module_base_symbol
12779 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12781 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12782 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12785 return ix86_tls_module_base_symbol;
12788 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12789 false if we expect this to be used for a memory address and true if
12790 we expect to load the address into a register. */
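/* Typical sequences for the models handled below, under the GNU TLS ABI
   (illustrative only; the real output depends on flags and linker relaxation):

     local exec, ia32:       movl  %gs:x@ntpoff, %eax
     initial exec, x86-64:   movq  x@gottpoff(%rip), %rax
                             movq  %fs:(%rax), %rax
     global/local dynamic:   a call to __tls_get_addr, or with -mtls-dialect=gnu2
                             an indirect call through the symbol's TLS descriptor.  */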
12792 static rtx
12793 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12795 rtx dest, base, off;
12796 rtx pic = NULL_RTX, tp = NULL_RTX;
12797 enum machine_mode tp_mode = Pmode;
12798 int type;
12800 switch (model)
12802 case TLS_MODEL_GLOBAL_DYNAMIC:
12803 dest = gen_reg_rtx (Pmode);
12805 if (!TARGET_64BIT)
12807 if (flag_pic)
12808 pic = pic_offset_table_rtx;
12809 else
12811 pic = gen_reg_rtx (Pmode);
12812 emit_insn (gen_set_got (pic));
12816 if (TARGET_GNU2_TLS)
12818 if (TARGET_64BIT)
12819 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12820 else
12821 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12823 tp = get_thread_pointer (Pmode, true);
12824 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12826 if (GET_MODE (x) != Pmode)
12827 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12829 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12831 else
12833 rtx caddr = ix86_tls_get_addr ();
12835 if (TARGET_64BIT)
12837 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12838 rtx insns;
12840 start_sequence ();
12841 emit_call_insn
12842 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12843 insns = get_insns ();
12844 end_sequence ();
12846 if (GET_MODE (x) != Pmode)
12847 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12849 RTL_CONST_CALL_P (insns) = 1;
12850 emit_libcall_block (insns, dest, rax, x);
12852 else
12853 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12855 break;
12857 case TLS_MODEL_LOCAL_DYNAMIC:
12858 base = gen_reg_rtx (Pmode);
12860 if (!TARGET_64BIT)
12862 if (flag_pic)
12863 pic = pic_offset_table_rtx;
12864 else
12866 pic = gen_reg_rtx (Pmode);
12867 emit_insn (gen_set_got (pic));
12871 if (TARGET_GNU2_TLS)
12873 rtx tmp = ix86_tls_module_base ();
12875 if (TARGET_64BIT)
12876 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12877 else
12878 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12880 tp = get_thread_pointer (Pmode, true);
12881 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12882 gen_rtx_MINUS (Pmode, tmp, tp));
12884 else
12886 rtx caddr = ix86_tls_get_addr ();
12888 if (TARGET_64BIT)
12890 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12891 rtx insns, eqv;
12893 start_sequence ();
12894 emit_call_insn
12895 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12896 insns = get_insns ();
12897 end_sequence ();
12899 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12900 share the LD_BASE result with other LD model accesses. */
12901 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12902 UNSPEC_TLS_LD_BASE);
12904 RTL_CONST_CALL_P (insns) = 1;
12905 emit_libcall_block (insns, base, rax, eqv);
12907 else
12908 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12911 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12912 off = gen_rtx_CONST (Pmode, off);
12914 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12916 if (TARGET_GNU2_TLS)
12918 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12920 if (GET_MODE (x) != Pmode)
12921 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12923 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12925 break;
12927 case TLS_MODEL_INITIAL_EXEC:
12928 if (TARGET_64BIT)
12930 if (TARGET_SUN_TLS && !TARGET_X32)
12932 /* The Sun linker took the AMD64 TLS spec literally
12933 and can only handle %rax as the destination of the
12934 initial-exec code sequence.  */
12936 dest = gen_reg_rtx (DImode);
12937 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12938 return dest;
12941 /* Generate DImode references to avoid %fs:(%reg32)
12942 problems and the linker IE->LE relaxation bug.  */
12943 tp_mode = DImode;
12944 pic = NULL;
12945 type = UNSPEC_GOTNTPOFF;
12947 else if (flag_pic)
12949 if (reload_in_progress)
12950 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12951 pic = pic_offset_table_rtx;
12952 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12954 else if (!TARGET_ANY_GNU_TLS)
12956 pic = gen_reg_rtx (Pmode);
12957 emit_insn (gen_set_got (pic));
12958 type = UNSPEC_GOTTPOFF;
12960 else
12962 pic = NULL;
12963 type = UNSPEC_INDNTPOFF;
12966 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12967 off = gen_rtx_CONST (tp_mode, off);
12968 if (pic)
12969 off = gen_rtx_PLUS (tp_mode, pic, off);
12970 off = gen_const_mem (tp_mode, off);
12971 set_mem_alias_set (off, ix86_GOT_alias_set ());
12973 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12975 base = get_thread_pointer (tp_mode,
12976 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12977 off = force_reg (tp_mode, off);
12978 return gen_rtx_PLUS (tp_mode, base, off);
12980 else
12982 base = get_thread_pointer (Pmode, true);
12983 dest = gen_reg_rtx (Pmode);
12984 emit_insn (ix86_gen_sub3 (dest, base, off));
12986 break;
12988 case TLS_MODEL_LOCAL_EXEC:
12989 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12990 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12991 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12992 off = gen_rtx_CONST (Pmode, off);
12994 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12996 base = get_thread_pointer (Pmode,
12997 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12998 return gen_rtx_PLUS (Pmode, base, off);
13000 else
13002 base = get_thread_pointer (Pmode, true);
13003 dest = gen_reg_rtx (Pmode);
13004 emit_insn (ix86_gen_sub3 (dest, base, off));
13006 break;
13008 default:
13009 gcc_unreachable ();
13012 return dest;
13015 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13016 to symbol DECL. */
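/* Illustration: a dllimport'd variable `foo' is not referenced directly;
   the import library provides a pointer named "__imp_foo" (with an extra
   underscore when the user label prefix is in use), so a 32-bit access
   typically looks like

     movl  __imp__foo, %eax
     movl  (%eax), %eax

   The VAR_DECL built below stands for that __imp_ pointer.  */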
13018 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13019 htab_t dllimport_map;
13021 static tree
13022 get_dllimport_decl (tree decl)
13024 struct tree_map *h, in;
13025 void **loc;
13026 const char *name;
13027 const char *prefix;
13028 size_t namelen, prefixlen;
13029 char *imp_name;
13030 tree to;
13031 rtx rtl;
13033 if (!dllimport_map)
13034 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13036 in.hash = htab_hash_pointer (decl);
13037 in.base.from = decl;
13038 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13039 h = (struct tree_map *) *loc;
13040 if (h)
13041 return h->to;
13043 *loc = h = ggc_alloc_tree_map ();
13044 h->hash = in.hash;
13045 h->base.from = decl;
13046 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13047 VAR_DECL, NULL, ptr_type_node);
13048 DECL_ARTIFICIAL (to) = 1;
13049 DECL_IGNORED_P (to) = 1;
13050 DECL_EXTERNAL (to) = 1;
13051 TREE_READONLY (to) = 1;
13053 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13054 name = targetm.strip_name_encoding (name);
13055 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13056 ? "*__imp_" : "*__imp__";
13057 namelen = strlen (name);
13058 prefixlen = strlen (prefix);
13059 imp_name = (char *) alloca (namelen + prefixlen + 1);
13060 memcpy (imp_name, prefix, prefixlen);
13061 memcpy (imp_name + prefixlen, name, namelen + 1);
13063 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13064 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13065 SET_SYMBOL_REF_DECL (rtl, to);
13066 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13068 rtl = gen_const_mem (Pmode, rtl);
13069 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13071 SET_DECL_RTL (to, rtl);
13072 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13074 return to;
13077 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13078 true if we require the result be a register. */
13080 static rtx
13081 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13083 tree imp_decl;
13084 rtx x;
13086 gcc_assert (SYMBOL_REF_DECL (symbol));
13087 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13089 x = DECL_RTL (imp_decl);
13090 if (want_reg)
13091 x = force_reg (Pmode, x);
13092 return x;
13095 /* Try machine-dependent ways of modifying an illegitimate address
13096 to be legitimate. If we find one, return the new, valid address.
13097 This macro is used in only one place: `memory_address' in explow.c.
13099 OLDX is the address as it was before break_out_memory_refs was called.
13100 In some cases it is useful to look at this to decide what needs to be done.
13102 It is always safe for this macro to do nothing. It exists to recognize
13103 opportunities to optimize the output.
13105 For the 80386, we handle X+REG by loading X into a register R and
13106 using R+REG. R will go in a general reg and indexing will be used.
13107 However, if REG is a broken-out memory address or multiplication,
13108 nothing needs to be done because REG can certainly go in a general reg.
13110 When -fpic is used, special handling is needed for symbolic references.
13111 See comments by legitimize_pic_address in i386.c for details. */
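/* Illustration: virtual register instantiation can leave an address such as

     (plus (mult (reg A) (const_int 4)) (plus (reg B) (const_int 16)))

   which no addressing mode accepts directly; the canonicalizations below
   re-associate it into

     (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 16))

   i.e. the SIB form 16(%B,%A,4).  */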
13113 static rtx
13114 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13115 enum machine_mode mode)
13117 int changed = 0;
13118 unsigned log;
13120 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13121 if (log)
13122 return legitimize_tls_address (x, (enum tls_model) log, false);
13123 if (GET_CODE (x) == CONST
13124 && GET_CODE (XEXP (x, 0)) == PLUS
13125 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13126 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13128 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13129 (enum tls_model) log, false);
13130 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13133 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13135 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13136 return legitimize_dllimport_symbol (x, true);
13137 if (GET_CODE (x) == CONST
13138 && GET_CODE (XEXP (x, 0)) == PLUS
13139 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13140 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13142 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13143 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13147 if (flag_pic && SYMBOLIC_CONST (x))
13148 return legitimize_pic_address (x, 0);
13150 #if TARGET_MACHO
13151 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13152 return machopic_indirect_data_reference (x, 0);
13153 #endif
13155 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13156 if (GET_CODE (x) == ASHIFT
13157 && CONST_INT_P (XEXP (x, 1))
13158 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13160 changed = 1;
13161 log = INTVAL (XEXP (x, 1));
13162 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13163 GEN_INT (1 << log));
13166 if (GET_CODE (x) == PLUS)
13168 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13170 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13171 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13172 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13174 changed = 1;
13175 log = INTVAL (XEXP (XEXP (x, 0), 1));
13176 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13177 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13178 GEN_INT (1 << log));
13181 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13182 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13183 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13185 changed = 1;
13186 log = INTVAL (XEXP (XEXP (x, 1), 1));
13187 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13188 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13189 GEN_INT (1 << log));
13192 /* Put multiply first if it isn't already. */
13193 if (GET_CODE (XEXP (x, 1)) == MULT)
13195 rtx tmp = XEXP (x, 0);
13196 XEXP (x, 0) = XEXP (x, 1);
13197 XEXP (x, 1) = tmp;
13198 changed = 1;
13201 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13202 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13203 created by virtual register instantiation, register elimination, and
13204 similar optimizations. */
13205 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13207 changed = 1;
13208 x = gen_rtx_PLUS (Pmode,
13209 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13210 XEXP (XEXP (x, 1), 0)),
13211 XEXP (XEXP (x, 1), 1));
13214 /* Canonicalize
13215 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13216 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13217 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13218 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13219 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13220 && CONSTANT_P (XEXP (x, 1)))
13222 rtx constant;
13223 rtx other = NULL_RTX;
13225 if (CONST_INT_P (XEXP (x, 1)))
13227 constant = XEXP (x, 1);
13228 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13230 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13232 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13233 other = XEXP (x, 1);
13235 else
13236 constant = 0;
13238 if (constant)
13240 changed = 1;
13241 x = gen_rtx_PLUS (Pmode,
13242 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13243 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13244 plus_constant (Pmode, other,
13245 INTVAL (constant)));
13249 if (changed && ix86_legitimate_address_p (mode, x, false))
13250 return x;
13252 if (GET_CODE (XEXP (x, 0)) == MULT)
13254 changed = 1;
13255 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13258 if (GET_CODE (XEXP (x, 1)) == MULT)
13260 changed = 1;
13261 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13264 if (changed
13265 && REG_P (XEXP (x, 1))
13266 && REG_P (XEXP (x, 0)))
13267 return x;
13269 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13271 changed = 1;
13272 x = legitimize_pic_address (x, 0);
13275 if (changed && ix86_legitimate_address_p (mode, x, false))
13276 return x;
13278 if (REG_P (XEXP (x, 0)))
13280 rtx temp = gen_reg_rtx (Pmode);
13281 rtx val = force_operand (XEXP (x, 1), temp);
13282 if (val != temp)
13284 val = convert_to_mode (Pmode, val, 1);
13285 emit_move_insn (temp, val);
13288 XEXP (x, 1) = temp;
13289 return x;
13292 else if (REG_P (XEXP (x, 1)))
13294 rtx temp = gen_reg_rtx (Pmode);
13295 rtx val = force_operand (XEXP (x, 0), temp);
13296 if (val != temp)
13298 val = convert_to_mode (Pmode, val, 1);
13299 emit_move_insn (temp, val);
13302 XEXP (x, 0) = temp;
13303 return x;
13307 return x;
13310 /* Print an integer constant expression in assembler syntax. Addition
13311 and subtraction are the only arithmetic that may appear in these
13312 expressions. FILE is the stdio stream to write to, X is the rtx, and
13313 CODE is the operand print code from the output string. */
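/* Illustration: the UNSPEC wrappers produced by legitimize_pic_address
   print as assembler relocation operators, e.g.

     (unspec [foo] UNSPEC_GOTOFF)    ->  foo@GOTOFF
     (unspec [foo] UNSPEC_GOTPCREL)  ->  foo@GOTPCREL(%rip)   (AT&T syntax)  */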
13315 static void
13316 output_pic_addr_const (FILE *file, rtx x, int code)
13318 char buf[256];
13320 switch (GET_CODE (x))
13322 case PC:
13323 gcc_assert (flag_pic);
13324 putc ('.', file);
13325 break;
13327 case SYMBOL_REF:
13328 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13329 output_addr_const (file, x);
13330 else
13332 const char *name = XSTR (x, 0);
13334 /* Mark the decl as referenced so that cgraph will
13335 output the function. */
13336 if (SYMBOL_REF_DECL (x))
13337 mark_decl_referenced (SYMBOL_REF_DECL (x));
13339 #if TARGET_MACHO
13340 if (MACHOPIC_INDIRECT
13341 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13342 name = machopic_indirection_name (x, /*stub_p=*/true);
13343 #endif
13344 assemble_name (file, name);
13346 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13347 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13348 fputs ("@PLT", file);
13349 break;
13351 case LABEL_REF:
13352 x = XEXP (x, 0);
13353 /* FALLTHRU */
13354 case CODE_LABEL:
13355 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13356 assemble_name (asm_out_file, buf);
13357 break;
13359 case CONST_INT:
13360 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13361 break;
13363 case CONST:
13364 /* This used to output parentheses around the expression,
13365 but that does not work on the 386 (either ATT or BSD assembler). */
13366 output_pic_addr_const (file, XEXP (x, 0), code);
13367 break;
13369 case CONST_DOUBLE:
13370 if (GET_MODE (x) == VOIDmode)
13372 /* We can use %d if the number is <32 bits and positive. */
13373 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13374 fprintf (file, "0x%lx%08lx",
13375 (unsigned long) CONST_DOUBLE_HIGH (x),
13376 (unsigned long) CONST_DOUBLE_LOW (x));
13377 else
13378 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13380 else
13381 /* We can't handle floating point constants;
13382 TARGET_PRINT_OPERAND must handle them. */
13383 output_operand_lossage ("floating constant misused");
13384 break;
13386 case PLUS:
13387 /* Some assemblers need integer constants to appear first. */
13388 if (CONST_INT_P (XEXP (x, 0)))
13390 output_pic_addr_const (file, XEXP (x, 0), code);
13391 putc ('+', file);
13392 output_pic_addr_const (file, XEXP (x, 1), code);
13394 else
13396 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13397 output_pic_addr_const (file, XEXP (x, 1), code);
13398 putc ('+', file);
13399 output_pic_addr_const (file, XEXP (x, 0), code);
13401 break;
13403 case MINUS:
13404 if (!TARGET_MACHO)
13405 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13406 output_pic_addr_const (file, XEXP (x, 0), code);
13407 putc ('-', file);
13408 output_pic_addr_const (file, XEXP (x, 1), code);
13409 if (!TARGET_MACHO)
13410 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13411 break;
13413 case UNSPEC:
13414 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13416 bool f = i386_asm_output_addr_const_extra (file, x);
13417 gcc_assert (f);
13418 break;
13421 gcc_assert (XVECLEN (x, 0) == 1);
13422 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13423 switch (XINT (x, 1))
13425 case UNSPEC_GOT:
13426 fputs ("@GOT", file);
13427 break;
13428 case UNSPEC_GOTOFF:
13429 fputs ("@GOTOFF", file);
13430 break;
13431 case UNSPEC_PLTOFF:
13432 fputs ("@PLTOFF", file);
13433 break;
13434 case UNSPEC_PCREL:
13435 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13436 "(%rip)" : "[rip]", file);
13437 break;
13438 case UNSPEC_GOTPCREL:
13439 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13440 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13441 break;
13442 case UNSPEC_GOTTPOFF:
13443 /* FIXME: This might be @TPOFF in Sun ld too. */
13444 fputs ("@gottpoff", file);
13445 break;
13446 case UNSPEC_TPOFF:
13447 fputs ("@tpoff", file);
13448 break;
13449 case UNSPEC_NTPOFF:
13450 if (TARGET_64BIT)
13451 fputs ("@tpoff", file);
13452 else
13453 fputs ("@ntpoff", file);
13454 break;
13455 case UNSPEC_DTPOFF:
13456 fputs ("@dtpoff", file);
13457 break;
13458 case UNSPEC_GOTNTPOFF:
13459 if (TARGET_64BIT)
13460 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13461 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13462 else
13463 fputs ("@gotntpoff", file);
13464 break;
13465 case UNSPEC_INDNTPOFF:
13466 fputs ("@indntpoff", file);
13467 break;
13468 #if TARGET_MACHO
13469 case UNSPEC_MACHOPIC_OFFSET:
13470 putc ('-', file);
13471 machopic_output_function_base_name (file);
13472 break;
13473 #endif
13474 default:
13475 output_operand_lossage ("invalid UNSPEC as operand");
13476 break;
13478 break;
13480 default:
13481 output_operand_lossage ("invalid expression as operand");
13485 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13486 We need to emit DTP-relative relocations. */
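/* Illustration: for a TLS symbol `x' this emits, with the default ASM_LONG,

     .long x@dtpoff          (size 4)
     .long x@dtpoff, 0       (size 8)  */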
13488 static void ATTRIBUTE_UNUSED
13489 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13491 fputs (ASM_LONG, file);
13492 output_addr_const (file, x);
13493 fputs ("@dtpoff", file);
13494 switch (size)
13496 case 4:
13497 break;
13498 case 8:
13499 fputs (", 0", file);
13500 break;
13501 default:
13502 gcc_unreachable ();
13506 /* Return true if X is a representation of the PIC register. This copes
13507 with calls from ix86_find_base_term, where the register might have
13508 been replaced by a cselib value. */
13510 static bool
13511 ix86_pic_register_p (rtx x)
13513 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13514 return (pic_offset_table_rtx
13515 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13516 else
13517 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13520 /* Helper function for ix86_delegitimize_address.
13521 Attempt to delegitimize TLS local-exec accesses. */
13523 static rtx
13524 ix86_delegitimize_tls_address (rtx orig_x)
13526 rtx x = orig_x, unspec;
13527 struct ix86_address addr;
13529 if (!TARGET_TLS_DIRECT_SEG_REFS)
13530 return orig_x;
13531 if (MEM_P (x))
13532 x = XEXP (x, 0);
13533 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13534 return orig_x;
13535 if (ix86_decompose_address (x, &addr) == 0
13536 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13537 || addr.disp == NULL_RTX
13538 || GET_CODE (addr.disp) != CONST)
13539 return orig_x;
13540 unspec = XEXP (addr.disp, 0);
13541 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13542 unspec = XEXP (unspec, 0);
13543 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13544 return orig_x;
13545 x = XVECEXP (unspec, 0, 0);
13546 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13547 if (unspec != XEXP (addr.disp, 0))
13548 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13549 if (addr.index)
13551 rtx idx = addr.index;
13552 if (addr.scale != 1)
13553 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13554 x = gen_rtx_PLUS (Pmode, idx, x);
13556 if (addr.base)
13557 x = gen_rtx_PLUS (Pmode, addr.base, x);
13558 if (MEM_P (orig_x))
13559 x = replace_equiv_address_nv (orig_x, x);
13560 return x;
13563 /* In the name of slightly smaller debug output, and to cater to
13564 general assembler lossage, recognize PIC+GOTOFF and turn it back
13565 into a direct symbol reference.
13567 On Darwin, this is necessary to avoid a crash, because Darwin
13568 has a different PIC label for each routine but the DWARF debugging
13569 information is not associated with any particular routine, so it's
13570 necessary to remove references to the PIC label from RTL stored by
13571 the DWARF output code. */
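/* Illustration: an address the PIC machinery produced, such as

     (plus (reg:SI bx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))

   is turned back into plain (symbol_ref "foo"), so the debug info does not
   mention the PIC register.  */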
13573 static rtx
13574 ix86_delegitimize_address (rtx x)
13576 rtx orig_x = delegitimize_mem_from_attrs (x);
13577 /* addend is NULL or some rtx if x is something+GOTOFF where
13578 something doesn't include the PIC register. */
13579 rtx addend = NULL_RTX;
13580 /* reg_addend is NULL or a multiple of some register. */
13581 rtx reg_addend = NULL_RTX;
13582 /* const_addend is NULL or a const_int. */
13583 rtx const_addend = NULL_RTX;
13584 /* This is the result, or NULL. */
13585 rtx result = NULL_RTX;
13587 x = orig_x;
13589 if (MEM_P (x))
13590 x = XEXP (x, 0);
13592 if (TARGET_64BIT)
13594 if (GET_CODE (x) == CONST
13595 && GET_CODE (XEXP (x, 0)) == PLUS
13596 && GET_MODE (XEXP (x, 0)) == Pmode
13597 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13598 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13599 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13601 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13602 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13603 if (MEM_P (orig_x))
13604 x = replace_equiv_address_nv (orig_x, x);
13605 return x;
13607 if (GET_CODE (x) != CONST
13608 || GET_CODE (XEXP (x, 0)) != UNSPEC
13609 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13610 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13611 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13612 return ix86_delegitimize_tls_address (orig_x);
13613 x = XVECEXP (XEXP (x, 0), 0, 0);
13614 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13616 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13617 GET_MODE (x), 0);
13618 if (x == NULL_RTX)
13619 return orig_x;
13621 return x;
13624 if (GET_CODE (x) != PLUS
13625 || GET_CODE (XEXP (x, 1)) != CONST)
13626 return ix86_delegitimize_tls_address (orig_x);
13628 if (ix86_pic_register_p (XEXP (x, 0)))
13629 /* %ebx + GOT/GOTOFF */
13631 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13633 /* %ebx + %reg * scale + GOT/GOTOFF */
13634 reg_addend = XEXP (x, 0);
13635 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13636 reg_addend = XEXP (reg_addend, 1);
13637 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13638 reg_addend = XEXP (reg_addend, 0);
13639 else
13641 reg_addend = NULL_RTX;
13642 addend = XEXP (x, 0);
13645 else
13646 addend = XEXP (x, 0);
13648 x = XEXP (XEXP (x, 1), 0);
13649 if (GET_CODE (x) == PLUS
13650 && CONST_INT_P (XEXP (x, 1)))
13652 const_addend = XEXP (x, 1);
13653 x = XEXP (x, 0);
13656 if (GET_CODE (x) == UNSPEC
13657 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13658 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13659 result = XVECEXP (x, 0, 0);
13661 if (TARGET_MACHO && darwin_local_data_pic (x)
13662 && !MEM_P (orig_x))
13663 result = XVECEXP (x, 0, 0);
13665 if (! result)
13666 return ix86_delegitimize_tls_address (orig_x);
13668 if (const_addend)
13669 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13670 if (reg_addend)
13671 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13672 if (addend)
13674 /* If the rest of original X doesn't involve the PIC register, add
13675 addend and subtract pic_offset_table_rtx. This can happen e.g.
13676 for code like:
13677 leal (%ebx, %ecx, 4), %ecx
13679 movl foo@GOTOFF(%ecx), %edx
13680 in which case we return (%ecx - %ebx) + foo. */
13681 if (pic_offset_table_rtx)
13682 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13683 pic_offset_table_rtx),
13684 result);
13685 else
13686 return orig_x;
13688 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13690 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13691 if (result == NULL_RTX)
13692 return orig_x;
13694 return result;
13697 /* If X is a machine specific address (i.e. a symbol or label being
13698 referenced as a displacement from the GOT implemented using an
13699 UNSPEC), then return the base term. Otherwise return X. */
13702 ix86_find_base_term (rtx x)
13704 rtx term;
13706 if (TARGET_64BIT)
13708 if (GET_CODE (x) != CONST)
13709 return x;
13710 term = XEXP (x, 0);
13711 if (GET_CODE (term) == PLUS
13712 && (CONST_INT_P (XEXP (term, 1))
13713 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13714 term = XEXP (term, 0);
13715 if (GET_CODE (term) != UNSPEC
13716 || (XINT (term, 1) != UNSPEC_GOTPCREL
13717 && XINT (term, 1) != UNSPEC_PCREL))
13718 return x;
13720 return XVECEXP (term, 0, 0);
13723 return ix86_delegitimize_address (x);
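/* Print to FILE the instruction suffix ("e", "ne", "g", "a", ...) that
   selects comparison CODE evaluated in flags mode MODE; if REVERSE, print
   the suffix for the reversed condition, and if FP use the spellings
   suitable after a floating-point compare (e.g. "nbe" rather than "a"
   for fcmov).  For instance (GT, CCGCmode) yields "g", as used in
   setg/cmovg/jg.  */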
13726 static void
13727 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13728 bool fp, FILE *file)
13730 const char *suffix;
13732 if (mode == CCFPmode || mode == CCFPUmode)
13734 code = ix86_fp_compare_code_to_integer (code);
13735 mode = CCmode;
13737 if (reverse)
13738 code = reverse_condition (code);
13740 switch (code)
13742 case EQ:
13743 switch (mode)
13745 case CCAmode:
13746 suffix = "a";
13747 break;
13749 case CCCmode:
13750 suffix = "c";
13751 break;
13753 case CCOmode:
13754 suffix = "o";
13755 break;
13757 case CCSmode:
13758 suffix = "s";
13759 break;
13761 default:
13762 suffix = "e";
13764 break;
13765 case NE:
13766 switch (mode)
13768 case CCAmode:
13769 suffix = "na";
13770 break;
13772 case CCCmode:
13773 suffix = "nc";
13774 break;
13776 case CCOmode:
13777 suffix = "no";
13778 break;
13780 case CCSmode:
13781 suffix = "ns";
13782 break;
13784 default:
13785 suffix = "ne";
13787 break;
13788 case GT:
13789 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13790 suffix = "g";
13791 break;
13792 case GTU:
13793 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13794 Those same assemblers have the same but opposite lossage on cmov. */
13795 if (mode == CCmode)
13796 suffix = fp ? "nbe" : "a";
13797 else if (mode == CCCmode)
13798 suffix = "b";
13799 else
13800 gcc_unreachable ();
13801 break;
13802 case LT:
13803 switch (mode)
13805 case CCNOmode:
13806 case CCGOCmode:
13807 suffix = "s";
13808 break;
13810 case CCmode:
13811 case CCGCmode:
13812 suffix = "l";
13813 break;
13815 default:
13816 gcc_unreachable ();
13818 break;
13819 case LTU:
13820 gcc_assert (mode == CCmode || mode == CCCmode);
13821 suffix = "b";
13822 break;
13823 case GE:
13824 switch (mode)
13826 case CCNOmode:
13827 case CCGOCmode:
13828 suffix = "ns";
13829 break;
13831 case CCmode:
13832 case CCGCmode:
13833 suffix = "ge";
13834 break;
13836 default:
13837 gcc_unreachable ();
13839 break;
13840 case GEU:
13841 /* ??? As above. */
13842 gcc_assert (mode == CCmode || mode == CCCmode);
13843 suffix = fp ? "nb" : "ae";
13844 break;
13845 case LE:
13846 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13847 suffix = "le";
13848 break;
13849 case LEU:
13850 /* ??? As above. */
13851 if (mode == CCmode)
13852 suffix = "be";
13853 else if (mode == CCCmode)
13854 suffix = fp ? "nb" : "ae";
13855 else
13856 gcc_unreachable ();
13857 break;
13858 case UNORDERED:
13859 suffix = fp ? "u" : "p";
13860 break;
13861 case ORDERED:
13862 suffix = fp ? "nu" : "np";
13863 break;
13864 default:
13865 gcc_unreachable ();
13867 fputs (suffix, file);
13870 /* Print the name of register X to FILE based on its machine mode and number.
13871 If CODE is 'w', pretend the mode is HImode.
13872 If CODE is 'b', pretend the mode is QImode.
13873 If CODE is 'k', pretend the mode is SImode.
13874 If CODE is 'q', pretend the mode is DImode.
13875 If CODE is 'x', pretend the mode is V4SFmode.
13876 If CODE is 't', pretend the mode is V8SFmode.
13877 If CODE is 'h', pretend the reg is the 'high' byte register.
13878 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13879 If CODE is 'd', duplicate the operand for AVX instruction.
13880 */
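/* Illustration: for operand (reg:SI ax), code 'b' prints %al, 'w' %ax,
   'k' %eax, 'q' %rax and 'h' %ah; the REX registers follow the other
   convention, e.g. r8b/r8w/r8d/r8 (no '%' is printed in Intel syntax).  */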
13882 void
13883 print_reg (rtx x, int code, FILE *file)
13885 const char *reg;
13886 unsigned int regno;
13887 bool duplicated = code == 'd' && TARGET_AVX;
13889 if (ASSEMBLER_DIALECT == ASM_ATT)
13890 putc ('%', file);
13892 if (x == pc_rtx)
13894 gcc_assert (TARGET_64BIT);
13895 fputs ("rip", file);
13896 return;
13899 regno = true_regnum (x);
13900 gcc_assert (regno != ARG_POINTER_REGNUM
13901 && regno != FRAME_POINTER_REGNUM
13902 && regno != FLAGS_REG
13903 && regno != FPSR_REG
13904 && regno != FPCR_REG);
13906 if (code == 'w' || MMX_REG_P (x))
13907 code = 2;
13908 else if (code == 'b')
13909 code = 1;
13910 else if (code == 'k')
13911 code = 4;
13912 else if (code == 'q')
13913 code = 8;
13914 else if (code == 'y')
13915 code = 3;
13916 else if (code == 'h')
13917 code = 0;
13918 else if (code == 'x')
13919 code = 16;
13920 else if (code == 't')
13921 code = 32;
13922 else
13923 code = GET_MODE_SIZE (GET_MODE (x));
13925 /* Irritatingly, AMD extended registers use a different naming convention
13926 from the normal registers: "r%d[bwd]".  */
13927 if (REX_INT_REGNO_P (regno))
13929 gcc_assert (TARGET_64BIT);
13930 putc ('r', file);
13931 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13932 switch (code)
13934 case 0:
13935 error ("extended registers have no high halves");
13936 break;
13937 case 1:
13938 putc ('b', file);
13939 break;
13940 case 2:
13941 putc ('w', file);
13942 break;
13943 case 4:
13944 putc ('d', file);
13945 break;
13946 case 8:
13947 /* no suffix */
13948 break;
13949 default:
13950 error ("unsupported operand size for extended register");
13951 break;
13953 return;
13956 reg = NULL;
13957 switch (code)
13959 case 3:
13960 if (STACK_TOP_P (x))
13962 reg = "st(0)";
13963 break;
13965 /* FALLTHRU */
13966 case 8:
13967 case 4:
13968 case 12:
13969 if (! ANY_FP_REG_P (x))
13970 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13971 /* FALLTHRU */
13972 case 16:
13973 case 2:
13974 normal:
13975 reg = hi_reg_name[regno];
13976 break;
13977 case 1:
13978 if (regno >= ARRAY_SIZE (qi_reg_name))
13979 goto normal;
13980 reg = qi_reg_name[regno];
13981 break;
13982 case 0:
13983 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13984 goto normal;
13985 reg = qi_high_reg_name[regno];
13986 break;
13987 case 32:
13988 if (SSE_REG_P (x))
13990 gcc_assert (!duplicated);
13991 putc ('y', file);
13992 fputs (hi_reg_name[regno] + 1, file);
13993 return;
13995 break;
13996 default:
13997 gcc_unreachable ();
14000 fputs (reg, file);
14001 if (duplicated)
14003 if (ASSEMBLER_DIALECT == ASM_ATT)
14004 fprintf (file, ", %%%s", reg);
14005 else
14006 fprintf (file, ", %s", reg);
14010 /* Locate some local-dynamic symbol still in use by this function
14011 so that we can print its name in some tls_local_dynamic_base
14012 pattern. */
14014 static int
14015 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14017 rtx x = *px;
14019 if (GET_CODE (x) == SYMBOL_REF
14020 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14022 cfun->machine->some_ld_name = XSTR (x, 0);
14023 return 1;
14026 return 0;
14029 static const char *
14030 get_some_local_dynamic_name (void)
14032 rtx insn;
14034 if (cfun->machine->some_ld_name)
14035 return cfun->machine->some_ld_name;
14037 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14038 if (NONDEBUG_INSN_P (insn)
14039 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14040 return cfun->machine->some_ld_name;
14042 return NULL;
14045 /* Meaning of CODE:
14046 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14047 C -- print opcode suffix for set/cmov insn.
14048 c -- like C, but print reversed condition
14049 F,f -- likewise, but for floating-point.
14050 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14051 otherwise nothing
14052 R -- print the prefix for register names.
14053 z -- print the opcode suffix for the size of the current operand.
14054 Z -- likewise, with special suffixes for x87 instructions.
14055 * -- print a star (in certain assembler syntax)
14056 A -- print an absolute memory reference.
14057 E -- print address with DImode register names if TARGET_64BIT.
14058 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14059 s -- print a shift double count, followed by the assembler's argument
14060 delimiter.
14061 b -- print the QImode name of the register for the indicated operand.
14062 %b0 would print %al if operands[0] is reg 0.
14063 w -- likewise, print the HImode name of the register.
14064 k -- likewise, print the SImode name of the register.
14065 q -- likewise, print the DImode name of the register.
14066 x -- likewise, print the V4SFmode name of the register.
14067 t -- likewise, print the V8SFmode name of the register.
14068 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14069 y -- print "st(0)" instead of "st" as a register.
14070 d -- print duplicated register operand for AVX instruction.
14071 D -- print condition for SSE cmp instruction.
14072 P -- if PIC, print an @PLT suffix.
14073 p -- print raw symbol name.
14074 X -- don't print any sort of PIC '@' suffix for a symbol.
14075 & -- print some in-use local-dynamic symbol name.
14076 H -- print a memory address offset by 8; used for sse high-parts
14077 Y -- print condition for XOP pcom* instruction.
14078 + -- print a branch hint as 'cs' or 'ds' prefix
14079 ; -- print a semicolon (after prefixes due to bug in older gas).
14080 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14081 @ -- print a segment register of thread base pointer load
14082 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14083 */
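/* Illustration: in an insn template such as the (hypothetical)
   "add%z0\t{%1, %0|%0, %1}", %z0 expands to the size suffix of operand 0
   ('l' for SImode), %b1 would print the QImode name of operand 1 (%al for
   register ax), and %C1 the condition suffix of a comparison operand.  */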
14085 void
14086 ix86_print_operand (FILE *file, rtx x, int code)
14088 if (code)
14090 switch (code)
14092 case 'A':
14093 switch (ASSEMBLER_DIALECT)
14095 case ASM_ATT:
14096 putc ('*', file);
14097 break;
14099 case ASM_INTEL:
14100 /* Intel syntax. For absolute addresses, registers should not
14101 be surrounded by brackets.  */
14102 if (!REG_P (x))
14104 putc ('[', file);
14105 ix86_print_operand (file, x, 0);
14106 putc (']', file);
14107 return;
14109 break;
14111 default:
14112 gcc_unreachable ();
14115 ix86_print_operand (file, x, 0);
14116 return;
14118 case 'E':
14119 /* Wrap address in an UNSPEC to declare special handling. */
14120 if (TARGET_64BIT)
14121 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14123 output_address (x);
14124 return;
14126 case 'L':
14127 if (ASSEMBLER_DIALECT == ASM_ATT)
14128 putc ('l', file);
14129 return;
14131 case 'W':
14132 if (ASSEMBLER_DIALECT == ASM_ATT)
14133 putc ('w', file);
14134 return;
14136 case 'B':
14137 if (ASSEMBLER_DIALECT == ASM_ATT)
14138 putc ('b', file);
14139 return;
14141 case 'Q':
14142 if (ASSEMBLER_DIALECT == ASM_ATT)
14143 putc ('l', file);
14144 return;
14146 case 'S':
14147 if (ASSEMBLER_DIALECT == ASM_ATT)
14148 putc ('s', file);
14149 return;
14151 case 'T':
14152 if (ASSEMBLER_DIALECT == ASM_ATT)
14153 putc ('t', file);
14154 return;
14156 case 'O':
14157 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14158 if (ASSEMBLER_DIALECT != ASM_ATT)
14159 return;
14161 switch (GET_MODE_SIZE (GET_MODE (x)))
14163 case 2:
14164 putc ('w', file);
14165 break;
14167 case 4:
14168 putc ('l', file);
14169 break;
14171 case 8:
14172 putc ('q', file);
14173 break;
14175 default:
14176 output_operand_lossage
14177 ("invalid operand size for operand code 'O'");
14178 return;
14181 putc ('.', file);
14182 #endif
14183 return;
14185 case 'z':
14186 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14188 /* Opcodes don't get size suffixes if using Intel opcodes. */
14189 if (ASSEMBLER_DIALECT == ASM_INTEL)
14190 return;
14192 switch (GET_MODE_SIZE (GET_MODE (x)))
14194 case 1:
14195 putc ('b', file);
14196 return;
14198 case 2:
14199 putc ('w', file);
14200 return;
14202 case 4:
14203 putc ('l', file);
14204 return;
14206 case 8:
14207 putc ('q', file);
14208 return;
14210 default:
14211 output_operand_lossage
14212 ("invalid operand size for operand code 'z'");
14213 return;
14217 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14218 warning
14219 (0, "non-integer operand used with operand code 'z'");
14220 /* FALLTHRU */
14222 case 'Z':
14223 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14224 if (ASSEMBLER_DIALECT == ASM_INTEL)
14225 return;
14227 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14229 switch (GET_MODE_SIZE (GET_MODE (x)))
14231 case 2:
14232 #ifdef HAVE_AS_IX86_FILDS
14233 putc ('s', file);
14234 #endif
14235 return;
14237 case 4:
14238 putc ('l', file);
14239 return;
14241 case 8:
14242 #ifdef HAVE_AS_IX86_FILDQ
14243 putc ('q', file);
14244 #else
14245 fputs ("ll", file);
14246 #endif
14247 return;
14249 default:
14250 break;
14253 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14255 /* 387 opcodes don't get size suffixes
14256 if the operands are registers. */
14257 if (STACK_REG_P (x))
14258 return;
14260 switch (GET_MODE_SIZE (GET_MODE (x)))
14262 case 4:
14263 putc ('s', file);
14264 return;
14266 case 8:
14267 putc ('l', file);
14268 return;
14270 case 12:
14271 case 16:
14272 putc ('t', file);
14273 return;
14275 default:
14276 break;
14279 else
14281 output_operand_lossage
14282 ("invalid operand type used with operand code 'Z'");
14283 return;
14286 output_operand_lossage
14287 ("invalid operand size for operand code 'Z'");
14288 return;
14290 case 'd':
14291 case 'b':
14292 case 'w':
14293 case 'k':
14294 case 'q':
14295 case 'h':
14296 case 't':
14297 case 'y':
14298 case 'x':
14299 case 'X':
14300 case 'P':
14301 case 'p':
14302 break;
14304 case 's':
14305 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14307 ix86_print_operand (file, x, 0);
14308 fputs (", ", file);
14310 return;
14312 case 'Y':
14313 switch (GET_CODE (x))
14315 case NE:
14316 fputs ("neq", file);
14317 break;
14318 case EQ:
14319 fputs ("eq", file);
14320 break;
14321 case GE:
14322 case GEU:
14323 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14324 break;
14325 case GT:
14326 case GTU:
14327 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14328 break;
14329 case LE:
14330 case LEU:
14331 fputs ("le", file);
14332 break;
14333 case LT:
14334 case LTU:
14335 fputs ("lt", file);
14336 break;
14337 case UNORDERED:
14338 fputs ("unord", file);
14339 break;
14340 case ORDERED:
14341 fputs ("ord", file);
14342 break;
14343 case UNEQ:
14344 fputs ("ueq", file);
14345 break;
14346 case UNGE:
14347 fputs ("nlt", file);
14348 break;
14349 case UNGT:
14350 fputs ("nle", file);
14351 break;
14352 case UNLE:
14353 fputs ("ule", file);
14354 break;
14355 case UNLT:
14356 fputs ("ult", file);
14357 break;
14358 case LTGT:
14359 fputs ("une", file);
14360 break;
14361 default:
14362 output_operand_lossage ("operand is not a condition code, "
14363 "invalid operand code 'Y'");
14364 return;
14366 return;
14368 case 'D':
14369 /* A little bit of brain damage here: the SSE compare instructions
14370 use completely different names for the comparisons than the
14371 fp conditional moves do.  */
14372 switch (GET_CODE (x))
14374 case UNEQ:
14375 if (TARGET_AVX)
14377 fputs ("eq_us", file);
14378 break;
14380 case EQ:
14381 fputs ("eq", file);
14382 break;
14383 case UNLT:
14384 if (TARGET_AVX)
14386 fputs ("nge", file);
14387 break;
14389 case LT:
14390 fputs ("lt", file);
14391 break;
14392 case UNLE:
14393 if (TARGET_AVX)
14395 fputs ("ngt", file);
14396 break;
14398 case LE:
14399 fputs ("le", file);
14400 break;
14401 case UNORDERED:
14402 fputs ("unord", file);
14403 break;
14404 case LTGT:
14405 if (TARGET_AVX)
14407 fputs ("neq_oq", file);
14408 break;
14410 case NE:
14411 fputs ("neq", file);
14412 break;
14413 case GE:
14414 if (TARGET_AVX)
14416 fputs ("ge", file);
14417 break;
14419 case UNGE:
14420 fputs ("nlt", file);
14421 break;
14422 case GT:
14423 if (TARGET_AVX)
14425 fputs ("gt", file);
14426 break;
14428 case UNGT:
14429 fputs ("nle", file);
14430 break;
14431 case ORDERED:
14432 fputs ("ord", file);
14433 break;
14434 default:
14435 output_operand_lossage ("operand is not a condition code, "
14436 "invalid operand code 'D'");
14437 return;
14439 return;
14441 case 'F':
14442 case 'f':
14443 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14444 if (ASSEMBLER_DIALECT == ASM_ATT)
14445 putc ('.', file);
14446 #endif
14448 case 'C':
14449 case 'c':
14450 if (!COMPARISON_P (x))
14452 output_operand_lossage ("operand is not a condition code, "
14453 "invalid operand code '%c'", code);
14454 return;
14456 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14457 code == 'c' || code == 'f',
14458 code == 'F' || code == 'f',
14459 file);
14460 return;
14462 case 'H':
14463 if (!offsettable_memref_p (x))
14465 output_operand_lossage ("operand is not an offsettable memory "
14466 "reference, invalid operand code 'H'");
14467 return;
14469 /* It doesn't actually matter what mode we use here, as we're
14470 only going to use this for printing. */
14471 x = adjust_address_nv (x, DImode, 8);
14472 break;
14474 case 'K':
14475 gcc_assert (CONST_INT_P (x));
14477 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14478 #ifdef HAVE_AS_IX86_HLE
14479 fputs ("xacquire ", file);
14480 #else
14481 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14482 #endif
14483 else if (INTVAL (x) & IX86_HLE_RELEASE)
14484 #ifdef HAVE_AS_IX86_HLE
14485 fputs ("xrelease ", file);
14486 #else
14487 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14488 #endif
14489 /* We do not want to print the value of the operand.  */
14490 return;
14492 case '*':
14493 if (ASSEMBLER_DIALECT == ASM_ATT)
14494 putc ('*', file);
14495 return;
14497 case '&':
14499 const char *name = get_some_local_dynamic_name ();
14500 if (name == NULL)
14501 output_operand_lossage ("'%%&' used without any "
14502 "local dynamic TLS references");
14503 else
14504 assemble_name (file, name);
14505 return;
14508 case '+':
14510 rtx x;
14512 if (!optimize
14513 || optimize_function_for_size_p (cfun)
14514 || !TARGET_BRANCH_PREDICTION_HINTS)
14515 return;
14517 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14518 if (x)
14520 int pred_val = INTVAL (XEXP (x, 0));
14522 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14523 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14525 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14526 bool cputaken
14527 = final_forward_branch_p (current_output_insn) == 0;
14529 /* Emit hints only in the case default branch prediction
14530 heuristics would fail. */
14531 if (taken != cputaken)
14533 /* We use 3e (DS) prefix for taken branches and
14534 2e (CS) prefix for not taken branches. */
14535 if (taken)
14536 fputs ("ds ; ", file);
14537 else
14538 fputs ("cs ; ", file);
14542 return;
14545 case ';':
14546 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14547 putc (';', file);
14548 #endif
14549 return;
14551 case '@':
14552 if (ASSEMBLER_DIALECT == ASM_ATT)
14553 putc ('%', file);
14555 /* The kernel uses a different segment register for performance
14556 reasons; a system call would not have to trash the userspace
14557 segment register, which would be expensive. */
14558 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14559 fputs ("fs", file);
14560 else
14561 fputs ("gs", file);
14562 return;
14564 case '~':
14565 putc (TARGET_AVX2 ? 'i' : 'f', file);
14566 return;
14568 case '^':
14569 if (TARGET_64BIT && Pmode != word_mode)
14570 fputs ("addr32 ", file);
14571 return;
14573 default:
14574 output_operand_lossage ("invalid operand code '%c'", code);
14578 if (REG_P (x))
14579 print_reg (x, code, file);
14581 else if (MEM_P (x))
14583 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14584 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14585 && GET_MODE (x) != BLKmode)
14587 const char * size;
14588 switch (GET_MODE_SIZE (GET_MODE (x)))
14590 case 1: size = "BYTE"; break;
14591 case 2: size = "WORD"; break;
14592 case 4: size = "DWORD"; break;
14593 case 8: size = "QWORD"; break;
14594 case 12: size = "TBYTE"; break;
14595 case 16:
14596 if (GET_MODE (x) == XFmode)
14597 size = "TBYTE";
14598 else
14599 size = "XMMWORD";
14600 break;
14601 case 32: size = "YMMWORD"; break;
14602 default:
14603 gcc_unreachable ();
14606 /* Check for explicit size override (codes 'b', 'w', 'k',
14607 'q' and 'x') */
14608 if (code == 'b')
14609 size = "BYTE";
14610 else if (code == 'w')
14611 size = "WORD";
14612 else if (code == 'k')
14613 size = "DWORD";
14614 else if (code == 'q')
14615 size = "QWORD";
14616 else if (code == 'x')
14617 size = "XMMWORD";
14619 fputs (size, file);
14620 fputs (" PTR ", file);
14623 x = XEXP (x, 0);
14624 /* Avoid (%rip) for call operands. */
14625 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14626 && !CONST_INT_P (x))
14627 output_addr_const (file, x);
14628 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14629 output_operand_lossage ("invalid constraints for operand");
14630 else
14631 output_address (x);
14634 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14636 REAL_VALUE_TYPE r;
14637 long l;
14639 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14640 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14642 if (ASSEMBLER_DIALECT == ASM_ATT)
14643 putc ('$', file);
14644 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14645 if (code == 'q')
14646 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14647 (unsigned long long) (int) l);
14648 else
14649 fprintf (file, "0x%08x", (unsigned int) l);
14652 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14654 REAL_VALUE_TYPE r;
14655 long l[2];
14657 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14658 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14660 if (ASSEMBLER_DIALECT == ASM_ATT)
14661 putc ('$', file);
14662 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14665 /* These float cases don't actually occur as immediate operands. */
14666 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14668 char dstr[30];
14670 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14671 fputs (dstr, file);
14674 else
14676 /* We have patterns that allow zero sets of memory, for instance.
14677 In 64-bit mode, we should probably support all 8-byte vectors,
14678 since we can in fact encode that into an immediate. */
14679 if (GET_CODE (x) == CONST_VECTOR)
14681 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14682 x = const0_rtx;
14685 if (code != 'P' && code != 'p')
14687 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14689 if (ASSEMBLER_DIALECT == ASM_ATT)
14690 putc ('$', file);
14692 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14693 || GET_CODE (x) == LABEL_REF)
14695 if (ASSEMBLER_DIALECT == ASM_ATT)
14696 putc ('$', file);
14697 else
14698 fputs ("OFFSET FLAT:", file);
14701 if (CONST_INT_P (x))
14702 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14703 else if (flag_pic || MACHOPIC_INDIRECT)
14704 output_pic_addr_const (file, x, code);
14705 else
14706 output_addr_const (file, x);
14710 static bool
14711 ix86_print_operand_punct_valid_p (unsigned char code)
14713 return (code == '@' || code == '*' || code == '+' || code == '&'
14714 || code == ';' || code == '~' || code == '^');
14717 /* Print a memory operand whose address is ADDR. */
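/* Illustration: a base %rbp, index %rax, scale 4, displacement -8 address
   is printed as -8(%rbp,%rax,4) in AT&T syntax and as [rbp-8+rax*4] in
   Intel syntax (any size prefix such as "QWORD PTR" is added by
   ix86_print_operand, not here).  */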
14719 static void
14720 ix86_print_operand_address (FILE *file, rtx addr)
14722 struct ix86_address parts;
14723 rtx base, index, disp;
14724 int scale;
14725 int ok;
14726 bool vsib = false;
14727 int code = 0;
14729 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14731 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14732 gcc_assert (parts.index == NULL_RTX);
14733 parts.index = XVECEXP (addr, 0, 1);
14734 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14735 addr = XVECEXP (addr, 0, 0);
14736 vsib = true;
14738 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14740 gcc_assert (TARGET_64BIT);
14741 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14742 code = 'q';
14744 else
14745 ok = ix86_decompose_address (addr, &parts);
14747 gcc_assert (ok);
14749 base = parts.base;
14750 index = parts.index;
14751 disp = parts.disp;
14752 scale = parts.scale;
14754 switch (parts.seg)
14756 case SEG_DEFAULT:
14757 break;
14758 case SEG_FS:
14759 case SEG_GS:
14760 if (ASSEMBLER_DIALECT == ASM_ATT)
14761 putc ('%', file);
14762 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14763 break;
14764 default:
14765 gcc_unreachable ();
14768 /* Use the one-byte-shorter RIP-relative addressing for 64-bit mode.  */
14769 if (TARGET_64BIT && !base && !index)
14771 rtx symbol = disp;
14773 if (GET_CODE (disp) == CONST
14774 && GET_CODE (XEXP (disp, 0)) == PLUS
14775 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14776 symbol = XEXP (XEXP (disp, 0), 0);
14778 if (GET_CODE (symbol) == LABEL_REF
14779 || (GET_CODE (symbol) == SYMBOL_REF
14780 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14781 base = pc_rtx;
14783 if (!base && !index)
14785 /* Displacement only requires special attention. */
14787 if (CONST_INT_P (disp))
14789 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14790 fputs ("ds:", file);
14791 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14793 else if (flag_pic)
14794 output_pic_addr_const (file, disp, 0);
14795 else
14796 output_addr_const (file, disp);
14798 else
14800 /* Print SImode register names to force addr32 prefix. */
14801 if (SImode_address_operand (addr, VOIDmode))
14803 #ifdef ENABLE_CHECKING
14804 gcc_assert (TARGET_64BIT);
14805 switch (GET_CODE (addr))
14807 case SUBREG:
14808 gcc_assert (GET_MODE (addr) == SImode);
14809 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14810 break;
14811 case ZERO_EXTEND:
14812 case AND:
14813 gcc_assert (GET_MODE (addr) == DImode);
14814 break;
14815 default:
14816 gcc_unreachable ();
14818 #endif
14819 gcc_assert (!code);
14820 code = 'k';
14822 else if (code == 0
14823 && TARGET_X32
14824 && disp
14825 && CONST_INT_P (disp)
14826 && INTVAL (disp) < -16*1024*1024)
14828 /* X32 runs in 64-bit mode, where displacement, DISP, in
14829 address DISP(%r64), is encoded as 32-bit immediate sign-
14830 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14831 address is %r64 + 0xffffffffbffffd00. When %r64 <
14832 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14833 which is invalid for x32. The correct address is %r64
14834 - 0x40000300 == 0xf7ffdd64. To properly encode
14835 -0x40000300(%r64) for x32, we zero-extend negative
14836 displacement by forcing addr32 prefix which truncates
14837 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14838 zero-extend all negative displacements, including -1(%rsp).
14839 However, for small negative displacements, sign-extension
14840 won't cause overflow. We only zero-extend negative
14841 displacements if they are < -16*1024*1024, which is also used
14842 to check legitimate address displacements for PIC. */
14843 code = 'k';
14846 if (ASSEMBLER_DIALECT == ASM_ATT)
14848 if (disp)
14850 if (flag_pic)
14851 output_pic_addr_const (file, disp, 0);
14852 else if (GET_CODE (disp) == LABEL_REF)
14853 output_asm_label (disp);
14854 else
14855 output_addr_const (file, disp);
14858 putc ('(', file);
14859 if (base)
14860 print_reg (base, code, file);
14861 if (index)
14863 putc (',', file);
14864 print_reg (index, vsib ? 0 : code, file);
14865 if (scale != 1 || vsib)
14866 fprintf (file, ",%d", scale);
14868 putc (')', file);
14870 else
14872 rtx offset = NULL_RTX;
14874 if (disp)
14876 /* Pull out the offset of a symbol; print any symbol itself. */
14877 if (GET_CODE (disp) == CONST
14878 && GET_CODE (XEXP (disp, 0)) == PLUS
14879 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14881 offset = XEXP (XEXP (disp, 0), 1);
14882 disp = gen_rtx_CONST (VOIDmode,
14883 XEXP (XEXP (disp, 0), 0));
14886 if (flag_pic)
14887 output_pic_addr_const (file, disp, 0);
14888 else if (GET_CODE (disp) == LABEL_REF)
14889 output_asm_label (disp);
14890 else if (CONST_INT_P (disp))
14891 offset = disp;
14892 else
14893 output_addr_const (file, disp);
14896 putc ('[', file);
14897 if (base)
14899 print_reg (base, code, file);
14900 if (offset)
14902 if (INTVAL (offset) >= 0)
14903 putc ('+', file);
14904 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14907 else if (offset)
14908 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14909 else
14910 putc ('0', file);
14912 if (index)
14914 putc ('+', file);
14915 print_reg (index, vsib ? 0 : code, file);
14916 if (scale != 1 || vsib)
14917 fprintf (file, "*%d", scale);
14919 putc (']', file);
14924 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14926 static bool
14927 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14929 rtx op;
14931 if (GET_CODE (x) != UNSPEC)
14932 return false;
14934 op = XVECEXP (x, 0, 0);
14935 switch (XINT (x, 1))
14937 case UNSPEC_GOTTPOFF:
14938 output_addr_const (file, op);
14939 /* FIXME: This might be @TPOFF in Sun ld. */
14940 fputs ("@gottpoff", file);
14941 break;
14942 case UNSPEC_TPOFF:
14943 output_addr_const (file, op);
14944 fputs ("@tpoff", file);
14945 break;
14946 case UNSPEC_NTPOFF:
14947 output_addr_const (file, op);
14948 if (TARGET_64BIT)
14949 fputs ("@tpoff", file);
14950 else
14951 fputs ("@ntpoff", file);
14952 break;
14953 case UNSPEC_DTPOFF:
14954 output_addr_const (file, op);
14955 fputs ("@dtpoff", file);
14956 break;
14957 case UNSPEC_GOTNTPOFF:
14958 output_addr_const (file, op);
14959 if (TARGET_64BIT)
14960 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14961 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14962 else
14963 fputs ("@gotntpoff", file);
14964 break;
14965 case UNSPEC_INDNTPOFF:
14966 output_addr_const (file, op);
14967 fputs ("@indntpoff", file);
14968 break;
14969 #if TARGET_MACHO
14970 case UNSPEC_MACHOPIC_OFFSET:
14971 output_addr_const (file, op);
14972 putc ('-', file);
14973 machopic_output_function_base_name (file);
14974 break;
14975 #endif
14977 case UNSPEC_STACK_CHECK:
14979 int offset;
14981 gcc_assert (flag_split_stack);
14983 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14984 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14985 #else
14986 gcc_unreachable ();
14987 #endif
14989 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14991 break;
14993 default:
14994 return false;
14997 return true;
15000 /* Split one or more double-mode RTL references into pairs of half-mode
15001 references. The RTL can be REG, offsettable MEM, integer constant, or
15002 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15003 split and "num" is its length. lo_half and hi_half are output arrays
15004 that parallel "operands". */
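/* For example, splitting a single DImode operand on a 32-bit target
   yields lo_half[0] as the SImode part at byte offset 0 and hi_half[0]
   as the SImode part at byte offset 4; for TImode the halves are
   DImode at offsets 0 and 8.  */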
15006 void
15007 split_double_mode (enum machine_mode mode, rtx operands[],
15008 int num, rtx lo_half[], rtx hi_half[])
15010 enum machine_mode half_mode;
15011 unsigned int byte;
15013 switch (mode)
15015 case TImode:
15016 half_mode = DImode;
15017 break;
15018 case DImode:
15019 half_mode = SImode;
15020 break;
15021 default:
15022 gcc_unreachable ();
15025 byte = GET_MODE_SIZE (half_mode);
15027 while (num--)
15029 rtx op = operands[num];
15031 /* simplify_subreg refuses to split volatile memory addresses,
15032 but we still have to handle them. */
15033 if (MEM_P (op))
15035 lo_half[num] = adjust_address (op, half_mode, 0);
15036 hi_half[num] = adjust_address (op, half_mode, byte);
15038 else
15040 lo_half[num] = simplify_gen_subreg (half_mode, op,
15041 GET_MODE (op) == VOIDmode
15042 ? mode : GET_MODE (op), 0);
15043 hi_half[num] = simplify_gen_subreg (half_mode, op,
15044 GET_MODE (op) == VOIDmode
15045 ? mode : GET_MODE (op), byte);
15050 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15051 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15052 is the expression of the binary operation. The output may either be
15053 emitted here, or returned to the caller, like all output_* functions.
15055 There is no guarantee that the operands are the same mode, as they
15056 might be within FLOAT or FLOAT_EXTEND expressions. */
15058 #ifndef SYSV386_COMPAT
15059 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15060 wants to fix the assemblers because that causes incompatibility
15061 with gcc. No-one wants to fix gcc because that causes
15062 incompatibility with assemblers... You can use the option of
15063 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15064 #define SYSV386_COMPAT 1
15065 #endif
15067 const char *
15068 output_387_binary_op (rtx insn, rtx *operands)
15070 static char buf[40];
15071 const char *p;
15072 const char *ssep;
15073 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15075 #ifdef ENABLE_CHECKING
15076 /* Even if we do not want to check the inputs, this documents the input
15077 constraints, which helps in understanding the following code. */
15078 if (STACK_REG_P (operands[0])
15079 && ((REG_P (operands[1])
15080 && REGNO (operands[0]) == REGNO (operands[1])
15081 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15082 || (REG_P (operands[2])
15083 && REGNO (operands[0]) == REGNO (operands[2])
15084 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15085 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15086 ; /* ok */
15087 else
15088 gcc_assert (is_sse);
15089 #endif
15091 switch (GET_CODE (operands[3]))
15093 case PLUS:
15094 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15095 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15096 p = "fiadd";
15097 else
15098 p = "fadd";
15099 ssep = "vadd";
15100 break;
15102 case MINUS:
15103 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15104 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15105 p = "fisub";
15106 else
15107 p = "fsub";
15108 ssep = "vsub";
15109 break;
15111 case MULT:
15112 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15113 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15114 p = "fimul";
15115 else
15116 p = "fmul";
15117 ssep = "vmul";
15118 break;
15120 case DIV:
15121 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15122 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15123 p = "fidiv";
15124 else
15125 p = "fdiv";
15126 ssep = "vdiv";
15127 break;
15129 default:
15130 gcc_unreachable ();
15133 if (is_sse)
15135 if (TARGET_AVX)
15137 strcpy (buf, ssep);
15138 if (GET_MODE (operands[0]) == SFmode)
15139 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15140 else
15141 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15143 else
15145 strcpy (buf, ssep + 1);
15146 if (GET_MODE (operands[0]) == SFmode)
15147 strcat (buf, "ss\t{%2, %0|%0, %2}");
15148 else
15149 strcat (buf, "sd\t{%2, %0|%0, %2}");
15151 return buf;
15153 strcpy (buf, p);
15155 switch (GET_CODE (operands[3]))
15157 case MULT:
15158 case PLUS:
15159 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15161 rtx temp = operands[2];
15162 operands[2] = operands[1];
15163 operands[1] = temp;
15166 /* We now know that operands[0] == operands[1]. */
15168 if (MEM_P (operands[2]))
15170 p = "%Z2\t%2";
15171 break;
15174 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15176 if (STACK_TOP_P (operands[0]))
15177 /* How is it that we are storing to a dead operand[2]?
15178 Well, presumably operands[1] is dead too. We can't
15179 store the result to st(0) as st(0) gets popped on this
15180 instruction. Instead store to operands[2] (which I
15181 think has to be st(1)). st(1) will be popped later.
15182 gcc <= 2.8.1 didn't have this check and generated
15183 assembly code that the Unixware assembler rejected. */
15184 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15185 else
15186 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15187 break;
15190 if (STACK_TOP_P (operands[0]))
15191 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15192 else
15193 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15194 break;
15196 case MINUS:
15197 case DIV:
15198 if (MEM_P (operands[1]))
15200 p = "r%Z1\t%1";
15201 break;
15204 if (MEM_P (operands[2]))
15206 p = "%Z2\t%2";
15207 break;
15210 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15212 #if SYSV386_COMPAT
15213 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15214 derived assemblers, confusingly reverse the direction of
15215 the operation for fsub{r} and fdiv{r} when the
15216 destination register is not st(0). The Intel assembler
15217 doesn't have this brain damage. Read !SYSV386_COMPAT to
15218 figure out what the hardware really does. */
15219 if (STACK_TOP_P (operands[0]))
15220 p = "{p\t%0, %2|rp\t%2, %0}";
15221 else
15222 p = "{rp\t%2, %0|p\t%0, %2}";
15223 #else
15224 if (STACK_TOP_P (operands[0]))
15225 /* As above for fmul/fadd, we can't store to st(0). */
15226 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15227 else
15228 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15229 #endif
15230 break;
15233 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15235 #if SYSV386_COMPAT
15236 if (STACK_TOP_P (operands[0]))
15237 p = "{rp\t%0, %1|p\t%1, %0}";
15238 else
15239 p = "{p\t%1, %0|rp\t%0, %1}";
15240 #else
15241 if (STACK_TOP_P (operands[0]))
15242 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15243 else
15244 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15245 #endif
15246 break;
15249 if (STACK_TOP_P (operands[0]))
15251 if (STACK_TOP_P (operands[1]))
15252 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15253 else
15254 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15255 break;
15257 else if (STACK_TOP_P (operands[1]))
15259 #if SYSV386_COMPAT
15260 p = "{\t%1, %0|r\t%0, %1}";
15261 #else
15262 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15263 #endif
15265 else
15267 #if SYSV386_COMPAT
15268 p = "{r\t%2, %0|\t%0, %2}";
15269 #else
15270 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15271 #endif
15273 break;
15275 default:
15276 gcc_unreachable ();
15279 strcat (buf, p);
15280 return buf;
15283 /* Check if a 256bit AVX register is referenced inside of EXP. */
15285 static int
15286 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15288 rtx exp = *pexp;
15290 if (GET_CODE (exp) == SUBREG)
15291 exp = SUBREG_REG (exp);
15293 if (REG_P (exp)
15294 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15295 return 1;
15297 return 0;
15300 /* Return needed mode for entity in optimize_mode_switching pass. */
15302 static int
15303 ix86_avx_u128_mode_needed (rtx insn)
15305 if (CALL_P (insn))
15307 rtx link;
15309 /* Needed mode is set to AVX_U128_CLEAN if there are
15310 no 256bit modes used in function arguments. */
15311 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15312 link;
15313 link = XEXP (link, 1))
15315 if (GET_CODE (XEXP (link, 0)) == USE)
15317 rtx arg = XEXP (XEXP (link, 0), 0);
15319 if (ix86_check_avx256_register (&arg, NULL))
15320 return AVX_U128_ANY;
15324 return AVX_U128_CLEAN;
15327 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15328 changes state only when a 256bit register is written to, but we need
15329 to prevent the compiler from moving the optimal insertion point above
15330 an eventual read from a 256bit register. */
15331 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15332 return AVX_U128_DIRTY;
15334 return AVX_U128_ANY;
15337 /* Return mode that i387 must be switched into
15338 prior to the execution of insn. */
15340 static int
15341 ix86_i387_mode_needed (int entity, rtx insn)
15343 enum attr_i387_cw mode;
15345 /* The mode UNINITIALIZED is used to store control word after a
15346 function call or ASM pattern. The mode ANY specifies that the function
15347 has no requirements on the control word and makes no changes in the
15348 bits we are interested in. */
15350 if (CALL_P (insn)
15351 || (NONJUMP_INSN_P (insn)
15352 && (asm_noperands (PATTERN (insn)) >= 0
15353 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15354 return I387_CW_UNINITIALIZED;
15356 if (recog_memoized (insn) < 0)
15357 return I387_CW_ANY;
15359 mode = get_attr_i387_cw (insn);
15361 switch (entity)
15363 case I387_TRUNC:
15364 if (mode == I387_CW_TRUNC)
15365 return mode;
15366 break;
15368 case I387_FLOOR:
15369 if (mode == I387_CW_FLOOR)
15370 return mode;
15371 break;
15373 case I387_CEIL:
15374 if (mode == I387_CW_CEIL)
15375 return mode;
15376 break;
15378 case I387_MASK_PM:
15379 if (mode == I387_CW_MASK_PM)
15380 return mode;
15381 break;
15383 default:
15384 gcc_unreachable ();
15387 return I387_CW_ANY;
15390 /* Return mode that entity must be switched into
15391 prior to the execution of insn. */
15394 ix86_mode_needed (int entity, rtx insn)
15396 switch (entity)
15398 case AVX_U128:
15399 return ix86_avx_u128_mode_needed (insn);
15400 case I387_TRUNC:
15401 case I387_FLOOR:
15402 case I387_CEIL:
15403 case I387_MASK_PM:
15404 return ix86_i387_mode_needed (entity, insn);
15405 default:
15406 gcc_unreachable ();
15408 return 0;
15411 /* Check if a 256bit AVX register is referenced in stores. */
15413 static void
15414 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15416 if (ix86_check_avx256_register (&dest, NULL))
15418 bool *used = (bool *) data;
15419 *used = true;
15423 /* Calculate mode of upper 128bit AVX registers after the insn. */
15425 static int
15426 ix86_avx_u128_mode_after (int mode, rtx insn)
15428 rtx pat = PATTERN (insn);
15430 if (vzeroupper_operation (pat, VOIDmode)
15431 || vzeroall_operation (pat, VOIDmode))
15432 return AVX_U128_CLEAN;
15434 /* We know that the state is clean after a CALL insn if no 256bit
15435 register is used for the function return value. */
15436 if (CALL_P (insn))
15438 bool avx_reg256_found = false;
15439 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15440 if (!avx_reg256_found)
15441 return AVX_U128_CLEAN;
15444 /* Otherwise, return current mode. Remember that if insn
15445 references AVX 256bit registers, the mode was already changed
15446 to DIRTY from MODE_NEEDED. */
15447 return mode;
15450 /* Return the mode that an insn results in. */
15453 ix86_mode_after (int entity, int mode, rtx insn)
15455 switch (entity)
15457 case AVX_U128:
15458 return ix86_avx_u128_mode_after (mode, insn);
15459 case I387_TRUNC:
15460 case I387_FLOOR:
15461 case I387_CEIL:
15462 case I387_MASK_PM:
15463 return mode;
15464 default:
15465 gcc_unreachable ();
15469 static int
15470 ix86_avx_u128_mode_entry (void)
15472 tree arg;
15474 /* Entry mode is set to AVX_U128_DIRTY if there are
15475 256bit modes used in function arguments. */
15476 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15477 arg = TREE_CHAIN (arg))
15479 rtx incoming = DECL_INCOMING_RTL (arg);
15481 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15482 return AVX_U128_DIRTY;
15485 return AVX_U128_CLEAN;
15488 /* Return a mode that ENTITY is assumed to be
15489 switched to at function entry. */
15492 ix86_mode_entry (int entity)
15494 switch (entity)
15496 case AVX_U128:
15497 return ix86_avx_u128_mode_entry ();
15498 case I387_TRUNC:
15499 case I387_FLOOR:
15500 case I387_CEIL:
15501 case I387_MASK_PM:
15502 return I387_CW_ANY;
15503 default:
15504 gcc_unreachable ();
15508 static int
15509 ix86_avx_u128_mode_exit (void)
15511 rtx reg = crtl->return_rtx;
15513 /* Exit mode is set to AVX_U128_DIRTY if there are
15514 256bit modes used in the function return register. */
15515 if (reg && ix86_check_avx256_register (&reg, NULL))
15516 return AVX_U128_DIRTY;
15518 return AVX_U128_CLEAN;
15521 /* Return a mode that ENTITY is assumed to be
15522 switched to at function exit. */
15525 ix86_mode_exit (int entity)
15527 switch (entity)
15529 case AVX_U128:
15530 return ix86_avx_u128_mode_exit ();
15531 case I387_TRUNC:
15532 case I387_FLOOR:
15533 case I387_CEIL:
15534 case I387_MASK_PM:
15535 return I387_CW_ANY;
15536 default:
15537 gcc_unreachable ();
15541 /* Output code to initialize control word copies used by trunc?f?i and
15542 rounding patterns. The original control word is saved in SLOT_CW_STORED,
15543 and a copy modified according to MODE is placed in the slot for MODE. */
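/* For reference: bits 10-11 of the x87 control word (mask 0x0c00) hold
   the rounding control (00 = to nearest, 01 = down, 10 = up,
   11 = toward zero), and bit 5 (0x0020) masks the precision exception;
   these are the constants manipulated below.  */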
15545 static void
15546 emit_i387_cw_initialization (int mode)
15548 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15549 rtx new_mode;
15551 enum ix86_stack_slot slot;
15553 rtx reg = gen_reg_rtx (HImode);
15555 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15556 emit_move_insn (reg, copy_rtx (stored_mode));
15558 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15559 || optimize_function_for_size_p (cfun))
15561 switch (mode)
15563 case I387_CW_TRUNC:
15564 /* round toward zero (truncate) */
15565 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15566 slot = SLOT_CW_TRUNC;
15567 break;
15569 case I387_CW_FLOOR:
15570 /* round down toward -oo */
15571 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15572 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15573 slot = SLOT_CW_FLOOR;
15574 break;
15576 case I387_CW_CEIL:
15577 /* round up toward +oo */
15578 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15579 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15580 slot = SLOT_CW_CEIL;
15581 break;
15583 case I387_CW_MASK_PM:
15584 /* mask precision exception for nearbyint() */
15585 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15586 slot = SLOT_CW_MASK_PM;
15587 break;
15589 default:
15590 gcc_unreachable ();
15593 else
15595 switch (mode)
15597 case I387_CW_TRUNC:
15598 /* round toward zero (truncate) */
15599 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15600 slot = SLOT_CW_TRUNC;
15601 break;
15603 case I387_CW_FLOOR:
15604 /* round down toward -oo */
15605 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15606 slot = SLOT_CW_FLOOR;
15607 break;
15609 case I387_CW_CEIL:
15610 /* round up toward +oo */
15611 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15612 slot = SLOT_CW_CEIL;
15613 break;
15615 case I387_CW_MASK_PM:
15616 /* mask precision exception for nearbyint() */
15617 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15618 slot = SLOT_CW_MASK_PM;
15619 break;
15621 default:
15622 gcc_unreachable ();
15626 gcc_assert (slot < MAX_386_STACK_LOCALS);
15628 new_mode = assign_386_stack_local (HImode, slot);
15629 emit_move_insn (new_mode, reg);
15632 /* Emit vzeroupper. */
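/* vzeroupper zeroes the upper 128 bits of all YMM registers; emitting
   it before legacy (non-VEX) SSE code is meant to avoid the AVX/SSE
   transition penalty on processors that track the upper-half state.  */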
15634 void
15635 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15637 int i;
15639 /* Cancel automatic vzeroupper insertion if there are
15640 live call-saved SSE registers at the insertion point. */
15642 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15643 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15644 return;
15646 if (TARGET_64BIT)
15647 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15648 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15649 return;
15651 emit_insn (gen_avx_vzeroupper ());
15654 /* Generate one or more insns to set ENTITY to MODE. */
15656 void
15657 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15659 switch (entity)
15661 case AVX_U128:
15662 if (mode == AVX_U128_CLEAN)
15663 ix86_avx_emit_vzeroupper (regs_live);
15664 break;
15665 case I387_TRUNC:
15666 case I387_FLOOR:
15667 case I387_CEIL:
15668 case I387_MASK_PM:
15669 if (mode != I387_CW_ANY
15670 && mode != I387_CW_UNINITIALIZED)
15671 emit_i387_cw_initialization (mode);
15672 break;
15673 default:
15674 gcc_unreachable ();
15678 /* Output code for INSN to convert a float to a signed int. OPERANDS
15679 are the insn operands. The output may be [HSD]Imode and the input
15680 operand may be [SDX]Fmode. */
15682 const char *
15683 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15685 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15686 int dimode_p = GET_MODE (operands[0]) == DImode;
15687 int round_mode = get_attr_i387_cw (insn);
15689 /* Jump through a hoop or two for DImode, since the hardware has no
15690 non-popping instruction. We used to do this a different way, but
15691 that was somewhat fragile and broke with post-reload splitters. */
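/* As a rough sketch (mem, new_cw and saved_cw are placeholders, and the
   exact suffixes come from the %Z operand modifier), a DImode store
   whose stack top stays live and which needs a non-default rounding
   mode comes out as something like:
	fld	%st(0)
	fldcw	new_cw
	fistpll	mem
	fldcw	saved_cw  */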
15692 if ((dimode_p || fisttp) && !stack_top_dies)
15693 output_asm_insn ("fld\t%y1", operands);
15695 gcc_assert (STACK_TOP_P (operands[1]));
15696 gcc_assert (MEM_P (operands[0]));
15697 gcc_assert (GET_MODE (operands[1]) != TFmode);
15699 if (fisttp)
15700 output_asm_insn ("fisttp%Z0\t%0", operands);
15701 else
15703 if (round_mode != I387_CW_ANY)
15704 output_asm_insn ("fldcw\t%3", operands);
15705 if (stack_top_dies || dimode_p)
15706 output_asm_insn ("fistp%Z0\t%0", operands);
15707 else
15708 output_asm_insn ("fist%Z0\t%0", operands);
15709 if (round_mode != I387_CW_ANY)
15710 output_asm_insn ("fldcw\t%2", operands);
15713 return "";
15716 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15717 have the values zero or one, indicates the ffreep insn's operand
15718 from the OPERANDS array. */
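/* When the assembler lacks the ffreep mnemonic, the ASM_SHORT fallback
   below emits the raw encoding: the 16-bit word 0xc<i>df, stored
   little-endian, yields the opcode bytes DF C0+i, i.e. "ffreep %st(i)".  */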
15720 static const char *
15721 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15723 if (TARGET_USE_FFREEP)
15724 #ifdef HAVE_AS_IX86_FFREEP
15725 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15726 #else
15728 static char retval[32];
15729 int regno = REGNO (operands[opno]);
15731 gcc_assert (STACK_REGNO_P (regno));
15733 regno -= FIRST_STACK_REG;
15735 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15736 return retval;
15738 #endif
15740 return opno ? "fstp\t%y1" : "fstp\t%y0";
15744 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15745 should be used. UNORDERED_P is true when fucom should be used. */
15747 const char *
15748 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15750 int stack_top_dies;
15751 rtx cmp_op0, cmp_op1;
15752 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15754 if (eflags_p)
15756 cmp_op0 = operands[0];
15757 cmp_op1 = operands[1];
15759 else
15761 cmp_op0 = operands[1];
15762 cmp_op1 = operands[2];
15765 if (is_sse)
15767 if (GET_MODE (operands[0]) == SFmode)
15768 if (unordered_p)
15769 return "%vucomiss\t{%1, %0|%0, %1}";
15770 else
15771 return "%vcomiss\t{%1, %0|%0, %1}";
15772 else
15773 if (unordered_p)
15774 return "%vucomisd\t{%1, %0|%0, %1}";
15775 else
15776 return "%vcomisd\t{%1, %0|%0, %1}";
15779 gcc_assert (STACK_TOP_P (cmp_op0));
15781 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15783 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15785 if (stack_top_dies)
15787 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15788 return output_387_ffreep (operands, 1);
15790 else
15791 return "ftst\n\tfnstsw\t%0";
15794 if (STACK_REG_P (cmp_op1)
15795 && stack_top_dies
15796 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15797 && REGNO (cmp_op1) != FIRST_STACK_REG)
15799 /* If the top of the 387 stack dies, and the other operand
15800 is also a stack register that dies, then this must be a
15801 `fcompp' float compare */
15803 if (eflags_p)
15805 /* There is no double popping fcomi variant. Fortunately,
15806 eflags is immune from the fstp's cc clobbering. */
15807 if (unordered_p)
15808 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15809 else
15810 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15811 return output_387_ffreep (operands, 0);
15813 else
15815 if (unordered_p)
15816 return "fucompp\n\tfnstsw\t%0";
15817 else
15818 return "fcompp\n\tfnstsw\t%0";
15821 else
15823 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
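/* For example, eflags_p = 1, a floating-point CMP_OP1, unordered_p = 1
   and a dying stack top give mask = 8 + 0 + 2 + 1 = 11, which selects
   the "fucomip" alternative.  */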
15825 static const char * const alt[16] =
15827 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15828 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15829 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15830 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15832 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15833 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15834 NULL,
15835 NULL,
15837 "fcomi\t{%y1, %0|%0, %y1}",
15838 "fcomip\t{%y1, %0|%0, %y1}",
15839 "fucomi\t{%y1, %0|%0, %y1}",
15840 "fucomip\t{%y1, %0|%0, %y1}",
15842 NULL,
15843 NULL,
15844 NULL,
15845 NULL
15848 int mask;
15849 const char *ret;
15851 mask = eflags_p << 3;
15852 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15853 mask |= unordered_p << 1;
15854 mask |= stack_top_dies;
15856 gcc_assert (mask < 16);
15857 ret = alt[mask];
15858 gcc_assert (ret);
15860 return ret;
15864 void
15865 ix86_output_addr_vec_elt (FILE *file, int value)
15867 const char *directive = ASM_LONG;
15869 #ifdef ASM_QUAD
15870 if (TARGET_LP64)
15871 directive = ASM_QUAD;
15872 #else
15873 gcc_assert (!TARGET_64BIT);
15874 #endif
15876 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15879 void
15880 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15882 const char *directive = ASM_LONG;
15884 #ifdef ASM_QUAD
15885 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15886 directive = ASM_QUAD;
15887 #else
15888 gcc_assert (!TARGET_64BIT);
15889 #endif
15890 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15891 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15892 fprintf (file, "%s%s%d-%s%d\n",
15893 directive, LPREFIX, value, LPREFIX, rel);
15894 else if (HAVE_AS_GOTOFF_IN_DATA)
15895 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15896 #if TARGET_MACHO
15897 else if (TARGET_MACHO)
15899 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15900 machopic_output_function_base_name (file);
15901 putc ('\n', file);
15903 #endif
15904 else
15905 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15906 GOT_SYMBOL_NAME, LPREFIX, value);
15909 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15910 for the target. */
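/* "xor reg, reg" is shorter and breaks the dependency on the old value,
   but it clobbers the flags, hence the FLAGS_REG clobber added below;
   the plain "mov $0, reg" form is kept only when the target prefers it
   (TARGET_USE_MOV0) and we are not optimizing for speed.  */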
15912 void
15913 ix86_expand_clear (rtx dest)
15915 rtx tmp;
15917 /* We play register width games, which are only valid after reload. */
15918 gcc_assert (reload_completed);
15920 /* Avoid HImode and its attendant prefix byte. */
15921 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15922 dest = gen_rtx_REG (SImode, REGNO (dest));
15923 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15925 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15926 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15928 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15929 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15932 emit_insn (tmp);
15935 /* X is an unchanging MEM. If it is a constant pool reference, return
15936 the constant pool rtx, else NULL. */
15939 maybe_get_pool_constant (rtx x)
15941 x = ix86_delegitimize_address (XEXP (x, 0));
15943 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15944 return get_pool_constant (x);
15946 return NULL_RTX;
15949 void
15950 ix86_expand_move (enum machine_mode mode, rtx operands[])
15952 rtx op0, op1;
15953 enum tls_model model;
15955 op0 = operands[0];
15956 op1 = operands[1];
15958 if (GET_CODE (op1) == SYMBOL_REF)
15960 model = SYMBOL_REF_TLS_MODEL (op1);
15961 if (model)
15963 op1 = legitimize_tls_address (op1, model, true);
15964 op1 = force_operand (op1, op0);
15965 if (op1 == op0)
15966 return;
15967 op1 = convert_to_mode (mode, op1, 1);
15969 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15970 && SYMBOL_REF_DLLIMPORT_P (op1))
15971 op1 = legitimize_dllimport_symbol (op1, false);
15973 else if (GET_CODE (op1) == CONST
15974 && GET_CODE (XEXP (op1, 0)) == PLUS
15975 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15977 rtx addend = XEXP (XEXP (op1, 0), 1);
15978 rtx symbol = XEXP (XEXP (op1, 0), 0);
15979 rtx tmp = NULL;
15981 model = SYMBOL_REF_TLS_MODEL (symbol);
15982 if (model)
15983 tmp = legitimize_tls_address (symbol, model, true);
15984 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15985 && SYMBOL_REF_DLLIMPORT_P (symbol))
15986 tmp = legitimize_dllimport_symbol (symbol, true);
15988 if (tmp)
15990 tmp = force_operand (tmp, NULL);
15991 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15992 op0, 1, OPTAB_DIRECT);
15993 if (tmp == op0)
15994 return;
15995 op1 = convert_to_mode (mode, tmp, 1);
15999 if ((flag_pic || MACHOPIC_INDIRECT)
16000 && symbolic_operand (op1, mode))
16002 if (TARGET_MACHO && !TARGET_64BIT)
16004 #if TARGET_MACHO
16005 /* dynamic-no-pic */
16006 if (MACHOPIC_INDIRECT)
16008 rtx temp = ((reload_in_progress
16009 || ((op0 && REG_P (op0))
16010 && mode == Pmode))
16011 ? op0 : gen_reg_rtx (Pmode));
16012 op1 = machopic_indirect_data_reference (op1, temp);
16013 if (MACHOPIC_PURE)
16014 op1 = machopic_legitimize_pic_address (op1, mode,
16015 temp == op1 ? 0 : temp);
16017 if (op0 != op1 && GET_CODE (op0) != MEM)
16019 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16020 emit_insn (insn);
16021 return;
16023 if (GET_CODE (op0) == MEM)
16024 op1 = force_reg (Pmode, op1);
16025 else
16027 rtx temp = op0;
16028 if (GET_CODE (temp) != REG)
16029 temp = gen_reg_rtx (Pmode);
16030 temp = legitimize_pic_address (op1, temp);
16031 if (temp == op0)
16032 return;
16033 op1 = temp;
16035 /* dynamic-no-pic */
16036 #endif
16038 else
16040 if (MEM_P (op0))
16041 op1 = force_reg (mode, op1);
16042 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16044 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16045 op1 = legitimize_pic_address (op1, reg);
16046 if (op0 == op1)
16047 return;
16048 op1 = convert_to_mode (mode, op1, 1);
16052 else
16054 if (MEM_P (op0)
16055 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16056 || !push_operand (op0, mode))
16057 && MEM_P (op1))
16058 op1 = force_reg (mode, op1);
16060 if (push_operand (op0, mode)
16061 && ! general_no_elim_operand (op1, mode))
16062 op1 = copy_to_mode_reg (mode, op1);
16064 /* Force large constants in 64bit compilation into a register
16065 so that they get CSEed. */
16066 if (can_create_pseudo_p ()
16067 && (mode == DImode) && TARGET_64BIT
16068 && immediate_operand (op1, mode)
16069 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16070 && !register_operand (op0, mode)
16071 && optimize)
16072 op1 = copy_to_mode_reg (mode, op1);
16074 if (can_create_pseudo_p ()
16075 && FLOAT_MODE_P (mode)
16076 && GET_CODE (op1) == CONST_DOUBLE)
16078 /* If we are loading a floating point constant to a register,
16079 force the value to memory now, since we'll get better code
16080 out the back end. */
16082 op1 = validize_mem (force_const_mem (mode, op1));
16083 if (!register_operand (op0, mode))
16085 rtx temp = gen_reg_rtx (mode);
16086 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16087 emit_move_insn (op0, temp);
16088 return;
16093 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16096 void
16097 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16099 rtx op0 = operands[0], op1 = operands[1];
16100 unsigned int align = GET_MODE_ALIGNMENT (mode);
16102 /* Force constants other than zero into memory. We do not know how
16103 the instructions used to build constants modify the upper 64 bits
16104 of the register; once we have that information, we may be able
16105 to handle some of them more efficiently. */
16106 if (can_create_pseudo_p ()
16107 && register_operand (op0, mode)
16108 && (CONSTANT_P (op1)
16109 || (GET_CODE (op1) == SUBREG
16110 && CONSTANT_P (SUBREG_REG (op1))))
16111 && !standard_sse_constant_p (op1))
16112 op1 = validize_mem (force_const_mem (mode, op1));
16114 /* We need to check memory alignment for SSE mode since an attribute
16115 can make operands unaligned. */
16116 if (can_create_pseudo_p ()
16117 && SSE_REG_MODE_P (mode)
16118 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16119 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16121 rtx tmp[2];
16123 /* ix86_expand_vector_move_misalign() does not like constants ... */
16124 if (CONSTANT_P (op1)
16125 || (GET_CODE (op1) == SUBREG
16126 && CONSTANT_P (SUBREG_REG (op1))))
16127 op1 = validize_mem (force_const_mem (mode, op1));
16129 /* ... nor both arguments in memory. */
16130 if (!register_operand (op0, mode)
16131 && !register_operand (op1, mode))
16132 op1 = force_reg (mode, op1);
16134 tmp[0] = op0; tmp[1] = op1;
16135 ix86_expand_vector_move_misalign (mode, tmp);
16136 return;
16139 /* Make operand1 a register if it isn't already. */
16140 if (can_create_pseudo_p ()
16141 && !register_operand (op0, mode)
16142 && !register_operand (op1, mode))
16144 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16145 return;
16148 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16151 /* Split 32-byte AVX unaligned load and store if needed. */
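/* A sketch of the two strategies used below: with splitting enabled, an
   unaligned 256-bit load becomes two 128-bit loads combined through a
   VEC_CONCAT (typically a vinsertf128), and an unaligned 256-bit store
   becomes two vextractf128 stores; otherwise a single unaligned
   256-bit load/store insn is emitted.  */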
16153 static void
16154 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16156 rtx m;
16157 rtx (*extract) (rtx, rtx, rtx);
16158 rtx (*load_unaligned) (rtx, rtx);
16159 rtx (*store_unaligned) (rtx, rtx);
16160 enum machine_mode mode;
16162 switch (GET_MODE (op0))
16164 default:
16165 gcc_unreachable ();
16166 case V32QImode:
16167 extract = gen_avx_vextractf128v32qi;
16168 load_unaligned = gen_avx_loaddqu256;
16169 store_unaligned = gen_avx_storedqu256;
16170 mode = V16QImode;
16171 break;
16172 case V8SFmode:
16173 extract = gen_avx_vextractf128v8sf;
16174 load_unaligned = gen_avx_loadups256;
16175 store_unaligned = gen_avx_storeups256;
16176 mode = V4SFmode;
16177 break;
16178 case V4DFmode:
16179 extract = gen_avx_vextractf128v4df;
16180 load_unaligned = gen_avx_loadupd256;
16181 store_unaligned = gen_avx_storeupd256;
16182 mode = V2DFmode;
16183 break;
16186 if (MEM_P (op1))
16188 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16190 rtx r = gen_reg_rtx (mode);
16191 m = adjust_address (op1, mode, 0);
16192 emit_move_insn (r, m);
16193 m = adjust_address (op1, mode, 16);
16194 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16195 emit_move_insn (op0, r);
16197 else
16198 emit_insn (load_unaligned (op0, op1));
16200 else if (MEM_P (op0))
16202 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16204 m = adjust_address (op0, mode, 0);
16205 emit_insn (extract (m, op1, const0_rtx));
16206 m = adjust_address (op0, mode, 16);
16207 emit_insn (extract (m, op1, const1_rtx));
16209 else
16210 emit_insn (store_unaligned (op0, op1));
16212 else
16213 gcc_unreachable ();
16216 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16217 straight to ix86_expand_vector_move. */
16218 /* Code generation for scalar reg-reg moves of single and double precision data:
16219 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16220 movaps reg, reg
16221 else
16222 movss reg, reg
16223 if (x86_sse_partial_reg_dependency == true)
16224 movapd reg, reg
16225 else
16226 movsd reg, reg
16228 Code generation for scalar loads of double precision data:
16229 if (x86_sse_split_regs == true)
16230 movlpd mem, reg (gas syntax)
16231 else
16232 movsd mem, reg
16234 Code generation for unaligned packed loads of single precision data
16235 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16236 if (x86_sse_unaligned_move_optimal)
16237 movups mem, reg
16239 if (x86_sse_partial_reg_dependency == true)
16241 xorps reg, reg
16242 movlps mem, reg
16243 movhps mem+8, reg
16245 else
16247 movlps mem, reg
16248 movhps mem+8, reg
16251 Code generation for unaligned packed loads of double precision data
16252 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16253 if (x86_sse_unaligned_move_optimal)
16254 movupd mem, reg
16256 if (x86_sse_split_regs == true)
16258 movlpd mem, reg
16259 movhpd mem+8, reg
16261 else
16263 movsd mem, reg
16264 movhpd mem+8, reg
16268 void
16269 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16271 rtx op0, op1, m;
16273 op0 = operands[0];
16274 op1 = operands[1];
16276 if (TARGET_AVX
16277 && GET_MODE_SIZE (mode) == 32)
16279 switch (GET_MODE_CLASS (mode))
16281 case MODE_VECTOR_INT:
16282 case MODE_INT:
16283 op0 = gen_lowpart (V32QImode, op0);
16284 op1 = gen_lowpart (V32QImode, op1);
16285 /* FALLTHRU */
16287 case MODE_VECTOR_FLOAT:
16288 ix86_avx256_split_vector_move_misalign (op0, op1);
16289 break;
16291 default:
16292 gcc_unreachable ();
16295 return;
16298 if (MEM_P (op1))
16300 /* ??? If we have typed data, then it would appear that using
16301 movdqu is the only way to get unaligned data loaded with
16302 integer type. */
16303 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16305 op0 = gen_lowpart (V16QImode, op0);
16306 op1 = gen_lowpart (V16QImode, op1);
16307 /* We will eventually emit movups based on insn attributes. */
16308 emit_insn (gen_sse2_loaddqu (op0, op1));
16310 else if (TARGET_SSE2 && mode == V2DFmode)
16312 rtx zero;
16314 if (TARGET_AVX
16315 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16316 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16317 || optimize_function_for_size_p (cfun))
16319 /* We will eventually emit movups based on insn attributes. */
16320 emit_insn (gen_sse2_loadupd (op0, op1));
16321 return;
16324 /* When SSE registers are split into halves, we can avoid
16325 writing to the top half twice. */
16326 if (TARGET_SSE_SPLIT_REGS)
16328 emit_clobber (op0);
16329 zero = op0;
16331 else
16333 /* ??? Not sure about the best option for the Intel chips.
16334 The following would seem to satisfy; the register is
16335 entirely cleared, breaking the dependency chain. We
16336 then store to the upper half, with a dependency depth
16337 of one. A rumor has it that Intel recommends two movsd
16338 followed by an unpacklpd, but this is unconfirmed. And
16339 given that the dependency depth of the unpacklpd would
16340 still be one, I'm not sure why this would be better. */
16341 zero = CONST0_RTX (V2DFmode);
16344 m = adjust_address (op1, DFmode, 0);
16345 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16346 m = adjust_address (op1, DFmode, 8);
16347 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16349 else
16351 if (TARGET_AVX
16352 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16353 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16354 || optimize_function_for_size_p (cfun))
16356 op0 = gen_lowpart (V4SFmode, op0);
16357 op1 = gen_lowpart (V4SFmode, op1);
16358 emit_insn (gen_sse_loadups (op0, op1));
16359 return;
16362 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16363 emit_move_insn (op0, CONST0_RTX (mode));
16364 else
16365 emit_clobber (op0);
16367 if (mode != V4SFmode)
16368 op0 = gen_lowpart (V4SFmode, op0);
16370 m = adjust_address (op1, V2SFmode, 0);
16371 emit_insn (gen_sse_loadlps (op0, op0, m));
16372 m = adjust_address (op1, V2SFmode, 8);
16373 emit_insn (gen_sse_loadhps (op0, op0, m));
16376 else if (MEM_P (op0))
16378 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16380 op0 = gen_lowpart (V16QImode, op0);
16381 op1 = gen_lowpart (V16QImode, op1);
16382 /* We will eventually emit movups based on insn attributes. */
16383 emit_insn (gen_sse2_storedqu (op0, op1));
16385 else if (TARGET_SSE2 && mode == V2DFmode)
16387 if (TARGET_AVX
16388 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16389 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16390 || optimize_function_for_size_p (cfun))
16391 /* We will eventually emit movups based on insn attributes. */
16392 emit_insn (gen_sse2_storeupd (op0, op1));
16393 else
16395 m = adjust_address (op0, DFmode, 0);
16396 emit_insn (gen_sse2_storelpd (m, op1));
16397 m = adjust_address (op0, DFmode, 8);
16398 emit_insn (gen_sse2_storehpd (m, op1));
16401 else
16403 if (mode != V4SFmode)
16404 op1 = gen_lowpart (V4SFmode, op1);
16406 if (TARGET_AVX
16407 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16408 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16409 || optimize_function_for_size_p (cfun))
16411 op0 = gen_lowpart (V4SFmode, op0);
16412 emit_insn (gen_sse_storeups (op0, op1));
16414 else
16416 m = adjust_address (op0, V2SFmode, 0);
16417 emit_insn (gen_sse_storelps (m, op1));
16418 m = adjust_address (op0, V2SFmode, 8);
16419 emit_insn (gen_sse_storehps (m, op1));
16423 else
16424 gcc_unreachable ();
16427 /* Expand a push in MODE. This is some mode for which we do not support
16428 proper push instructions, at least from the registers that we expect
16429 the value to live in. */
16431 void
16432 ix86_expand_push (enum machine_mode mode, rtx x)
16434 rtx tmp;
16436 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16437 GEN_INT (-GET_MODE_SIZE (mode)),
16438 stack_pointer_rtx, 1, OPTAB_DIRECT);
16439 if (tmp != stack_pointer_rtx)
16440 emit_move_insn (stack_pointer_rtx, tmp);
16442 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16444 /* When we push an operand onto the stack, it has to be aligned at least
16445 at the function argument boundary. However, since we don't have
16446 the argument type, we can't determine the actual argument
16447 boundary. */
16448 emit_move_insn (tmp, x);
16451 /* Helper function of ix86_fixup_binary_operands to canonicalize
16452 operand order. Returns true if the operands should be swapped. */
16454 static bool
16455 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16456 rtx operands[])
16458 rtx dst = operands[0];
16459 rtx src1 = operands[1];
16460 rtx src2 = operands[2];
16462 /* If the operation is not commutative, we can't do anything. */
16463 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16464 return false;
16466 /* Highest priority is that src1 should match dst. */
16467 if (rtx_equal_p (dst, src1))
16468 return false;
16469 if (rtx_equal_p (dst, src2))
16470 return true;
16472 /* Next highest priority is that immediate constants come second. */
16473 if (immediate_operand (src2, mode))
16474 return false;
16475 if (immediate_operand (src1, mode))
16476 return true;
16478 /* Lowest priority is that memory references should come second. */
16479 if (MEM_P (src2))
16480 return false;
16481 if (MEM_P (src1))
16482 return true;
16484 return false;
16488 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16489 destination to use for the operation. If different from the true
16490 destination in operands[0], a copy operation will be required. */
16493 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16494 rtx operands[])
16496 rtx dst = operands[0];
16497 rtx src1 = operands[1];
16498 rtx src2 = operands[2];
16500 /* Canonicalize operand order. */
16501 if (ix86_swap_binary_operands_p (code, mode, operands))
16503 rtx temp;
16505 /* It is invalid to swap operands of different modes. */
16506 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16508 temp = src1;
16509 src1 = src2;
16510 src2 = temp;
16513 /* Both source operands cannot be in memory. */
16514 if (MEM_P (src1) && MEM_P (src2))
16516 /* Optimization: Only read from memory once. */
16517 if (rtx_equal_p (src1, src2))
16519 src2 = force_reg (mode, src2);
16520 src1 = src2;
16522 else
16523 src2 = force_reg (mode, src2);
16526 /* If the destination is memory, and we do not have matching source
16527 operands, do things in registers. */
16528 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16529 dst = gen_reg_rtx (mode);
16531 /* Source 1 cannot be a constant. */
16532 if (CONSTANT_P (src1))
16533 src1 = force_reg (mode, src1);
16535 /* Source 1 cannot be a non-matching memory. */
16536 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16537 src1 = force_reg (mode, src1);
16539 /* Improve address combine. */
16540 if (code == PLUS
16541 && GET_MODE_CLASS (mode) == MODE_INT
16542 && MEM_P (src2))
16543 src2 = force_reg (mode, src2);
16545 operands[1] = src1;
16546 operands[2] = src2;
16547 return dst;
16550 /* Similarly, but assume that the destination has already been
16551 set up properly. */
16553 void
16554 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16555 enum machine_mode mode, rtx operands[])
16557 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16558 gcc_assert (dst == operands[0]);
16561 /* Attempt to expand a binary operator. Make the expansion closer to the
16562 actual machine than just general_operand, which would allow 3 separate
16563 memory references (one output, two input) in a single insn. */
16565 void
16566 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16567 rtx operands[])
16569 rtx src1, src2, dst, op, clob;
16571 dst = ix86_fixup_binary_operands (code, mode, operands);
16572 src1 = operands[1];
16573 src2 = operands[2];
16575 /* Emit the instruction. */
16577 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16578 if (reload_in_progress)
16580 /* Reload doesn't know about the flags register, and doesn't know that
16581 it doesn't want to clobber it. We can only do this with PLUS. */
16582 gcc_assert (code == PLUS);
16583 emit_insn (op);
16585 else if (reload_completed
16586 && code == PLUS
16587 && !rtx_equal_p (dst, src1))
16589 /* This is going to be an LEA; avoid splitting it later. */
16590 emit_insn (op);
16592 else
16594 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16595 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16598 /* Fix up the destination if needed. */
16599 if (dst != operands[0])
16600 emit_move_insn (operands[0], dst);
16603 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16604 the given OPERANDS. */
16606 void
16607 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16608 rtx operands[])
16610 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16611 if (GET_CODE (operands[1]) == SUBREG)
16613 op1 = operands[1];
16614 op2 = operands[2];
16616 else if (GET_CODE (operands[2]) == SUBREG)
16618 op1 = operands[2];
16619 op2 = operands[1];
16621 /* Optimize (__m128i) d | (__m128i) e and similar code,
16622 where d and e are float vectors, into a float vector logical
16623 insn. In C/C++ without using intrinsics there is no other way
16624 to express vector logical operation on float vectors than
16625 to cast them temporarily to integer vectors. */
16626 if (op1
16627 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16628 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16629 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16630 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16631 && SUBREG_BYTE (op1) == 0
16632 && (GET_CODE (op2) == CONST_VECTOR
16633 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16634 && SUBREG_BYTE (op2) == 0))
16635 && can_create_pseudo_p ())
16637 rtx dst;
16638 switch (GET_MODE (SUBREG_REG (op1)))
16640 case V4SFmode:
16641 case V8SFmode:
16642 case V2DFmode:
16643 case V4DFmode:
16644 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16645 if (GET_CODE (op2) == CONST_VECTOR)
16647 op2 = gen_lowpart (GET_MODE (dst), op2);
16648 op2 = force_reg (GET_MODE (dst), op2);
16650 else
16652 op1 = operands[1];
16653 op2 = SUBREG_REG (operands[2]);
16654 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16655 op2 = force_reg (GET_MODE (dst), op2);
16657 op1 = SUBREG_REG (op1);
16658 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16659 op1 = force_reg (GET_MODE (dst), op1);
16660 emit_insn (gen_rtx_SET (VOIDmode, dst,
16661 gen_rtx_fmt_ee (code, GET_MODE (dst),
16662 op1, op2)));
16663 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16664 return;
16665 default:
16666 break;
16669 if (!nonimmediate_operand (operands[1], mode))
16670 operands[1] = force_reg (mode, operands[1]);
16671 if (!nonimmediate_operand (operands[2], mode))
16672 operands[2] = force_reg (mode, operands[2]);
16673 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16674 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16675 gen_rtx_fmt_ee (code, mode, operands[1],
16676 operands[2])));
16679 /* Return TRUE or FALSE depending on whether the binary operator meets the
16680 appropriate constraints. */
16682 bool
16683 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16684 rtx operands[3])
16686 rtx dst = operands[0];
16687 rtx src1 = operands[1];
16688 rtx src2 = operands[2];
16690 /* Both source operands cannot be in memory. */
16691 if (MEM_P (src1) && MEM_P (src2))
16692 return false;
16694 /* Canonicalize operand order for commutative operators. */
16695 if (ix86_swap_binary_operands_p (code, mode, operands))
16697 rtx temp = src1;
16698 src1 = src2;
16699 src2 = temp;
16702 /* If the destination is memory, we must have a matching source operand. */
16703 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16704 return false;
16706 /* Source 1 cannot be a constant. */
16707 if (CONSTANT_P (src1))
16708 return false;
16710 /* Source 1 cannot be a non-matching memory. */
16711 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16712 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16713 return (code == AND
16714 && (mode == HImode
16715 || mode == SImode
16716 || (TARGET_64BIT && mode == DImode))
16717 && satisfies_constraint_L (src2));
16719 return true;
16722 /* Attempt to expand a unary operator. Make the expansion closer to the
16723 actual machine than just general_operand, which would allow 2 separate
16724 memory references (one output, one input) in a single insn. */
16726 void
16727 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16728 rtx operands[])
16730 int matching_memory;
16731 rtx src, dst, op, clob;
16733 dst = operands[0];
16734 src = operands[1];
16736 /* If the destination is memory, and we do not have matching source
16737 operands, do things in registers. */
16738 matching_memory = 0;
16739 if (MEM_P (dst))
16741 if (rtx_equal_p (dst, src))
16742 matching_memory = 1;
16743 else
16744 dst = gen_reg_rtx (mode);
16747 /* When source operand is memory, destination must match. */
16748 if (MEM_P (src) && !matching_memory)
16749 src = force_reg (mode, src);
16751 /* Emit the instruction. */
16753 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16754 if (reload_in_progress || code == NOT)
16756 /* Reload doesn't know about the flags register, and doesn't know that
16757 it doesn't want to clobber it. */
16758 gcc_assert (code == NOT);
16759 emit_insn (op);
16761 else
16763 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16764 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16767 /* Fix up the destination if needed. */
16768 if (dst != operands[0])
16769 emit_move_insn (operands[0], dst);
16772 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16773 divisor are within the range [0-255]. */
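/* Rough shape of the emitted code (a sketch; the labels are
   illustrative only):
	scratch = dividend | divisor;
	if ((scratch & ~0xff) == 0) goto qimode;
	<full-width signed/unsigned divmod>
	goto done;
     qimode:
	<8-bit unsigned divide: AL = quotient, AH = remainder>
     done:  */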
16775 void
16776 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16777 bool signed_p)
16779 rtx end_label, qimode_label;
16780 rtx insn, div, mod;
16781 rtx scratch, tmp0, tmp1, tmp2;
16782 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16783 rtx (*gen_zero_extend) (rtx, rtx);
16784 rtx (*gen_test_ccno_1) (rtx, rtx);
16786 switch (mode)
16788 case SImode:
16789 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16790 gen_test_ccno_1 = gen_testsi_ccno_1;
16791 gen_zero_extend = gen_zero_extendqisi2;
16792 break;
16793 case DImode:
16794 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16795 gen_test_ccno_1 = gen_testdi_ccno_1;
16796 gen_zero_extend = gen_zero_extendqidi2;
16797 break;
16798 default:
16799 gcc_unreachable ();
16802 end_label = gen_label_rtx ();
16803 qimode_label = gen_label_rtx ();
16805 scratch = gen_reg_rtx (mode);
16807 /* Use 8bit unsigned divmod if dividend and divisor are within
16808 the range [0-255]. */
16809 emit_move_insn (scratch, operands[2]);
16810 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16811 scratch, 1, OPTAB_DIRECT);
16812 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16813 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16814 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16815 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16816 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16817 pc_rtx);
16818 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16819 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16820 JUMP_LABEL (insn) = qimode_label;
16822 /* Generate original signed/unsigned divmod. */
16823 div = gen_divmod4_1 (operands[0], operands[1],
16824 operands[2], operands[3]);
16825 emit_insn (div);
16827 /* Branch to the end. */
16828 emit_jump_insn (gen_jump (end_label));
16829 emit_barrier ();
16831 /* Generate 8bit unsigned divide. */
16832 emit_label (qimode_label);
16833 /* Don't use operands[0] for result of 8bit divide since not all
16834 registers support QImode ZERO_EXTRACT. */
16835 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16836 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16837 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16838 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16840 if (signed_p)
16842 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16843 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16845 else
16847 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16848 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16851 /* Extract remainder from AH. */
16852 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16853 if (REG_P (operands[1]))
16854 insn = emit_move_insn (operands[1], tmp1);
16855 else
16857 /* Need a new scratch register since the old one has result
16858 of 8bit divide. */
16859 scratch = gen_reg_rtx (mode);
16860 emit_move_insn (scratch, tmp1);
16861 insn = emit_move_insn (operands[1], scratch);
16863 set_unique_reg_note (insn, REG_EQUAL, mod);
16865 /* Zero extend quotient from AL. */
16866 tmp1 = gen_lowpart (QImode, tmp0);
16867 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16868 set_unique_reg_note (insn, REG_EQUAL, div);
16870 emit_label (end_label);
16873 #define LEA_MAX_STALL (3)
16874 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
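/* The distances computed by the helpers below are measured in
   half-cycles, so LEA_SEARCH_THRESHOLD corresponds to a window of
   LEA_MAX_STALL full cycles around the LEA instruction.  */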
16876 /* Increase given DISTANCE in half-cycles according to
16877 dependencies between PREV and NEXT instructions.
16878 Add 1 half-cycle if there is no dependency and
16879 go to the next cycle if there is some dependency. */
16881 static unsigned int
16882 increase_distance (rtx prev, rtx next, unsigned int distance)
16884 df_ref *use_rec;
16885 df_ref *def_rec;
16887 if (!prev || !next)
16888 return distance + (distance & 1) + 2;
16890 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16891 return distance + 1;
16893 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16894 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16895 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16896 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16897 return distance + (distance & 1) + 2;
16899 return distance + 1;
16902 /* Function checks if instruction INSN defines register number
16903 REGNO1 or REGNO2. */
16905 static bool
16906 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16907 rtx insn)
16909 df_ref *def_rec;
16911 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16912 if (DF_REF_REG_DEF_P (*def_rec)
16913 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16914 && (regno1 == DF_REF_REGNO (*def_rec)
16915 || regno2 == DF_REF_REGNO (*def_rec)))
16917 return true;
16920 return false;
16923 /* Function checks if instruction INSN uses register number
16924 REGNO as a part of address expression. */
16926 static bool
16927 insn_uses_reg_mem (unsigned int regno, rtx insn)
16929 df_ref *use_rec;
16931 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16932 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16933 return true;
16935 return false;
16938 /* Search backward for non-agu definition of register number REGNO1
16939    or register number REGNO2 in the basic block, starting from instruction
16940    START up to the head of the basic block or instruction INSN.
16942    Put true into *FOUND if a definition was found and false
16943    otherwise.
16945    The distance in half-cycles between START and the found instruction or
16946    the head of the BB is added to DISTANCE and returned.  */
16948 static int
16949 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16950 rtx insn, int distance,
16951 rtx start, bool *found)
16953 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16954 rtx prev = start;
16955 rtx next = NULL;
16957 *found = false;
16959 while (prev
16960 && prev != insn
16961 && distance < LEA_SEARCH_THRESHOLD)
16963 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16965 distance = increase_distance (prev, next, distance);
16966 if (insn_defines_reg (regno1, regno2, prev))
16968 if (recog_memoized (prev) < 0
16969 || get_attr_type (prev) != TYPE_LEA)
16971 *found = true;
16972 return distance;
16976 next = prev;
16978 if (prev == BB_HEAD (bb))
16979 break;
16981 prev = PREV_INSN (prev);
16984 return distance;
16987 /* Search backward for non-agu definition of register number REGNO1
16988 or register number REGNO2 in INSN's basic block until
16989 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16990    2. Reach the neighbouring BBs' boundary, or
16991    3. Reach an agu definition.
16992    Return the distance between the non-agu definition point and INSN.
16993    If there is no definition point, return -1.  */
16995 static int
16996 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16997 rtx insn)
16999 basic_block bb = BLOCK_FOR_INSN (insn);
17000 int distance = 0;
17001 bool found = false;
17003 if (insn != BB_HEAD (bb))
17004 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17005 distance, PREV_INSN (insn),
17006 &found);
17008 if (!found && distance < LEA_SEARCH_THRESHOLD)
17010 edge e;
17011 edge_iterator ei;
17012 bool simple_loop = false;
17014 FOR_EACH_EDGE (e, ei, bb->preds)
17015 if (e->src == bb)
17017 simple_loop = true;
17018 break;
17021 if (simple_loop)
17022 distance = distance_non_agu_define_in_bb (regno1, regno2,
17023 insn, distance,
17024 BB_END (bb), &found);
17025 else
17027 int shortest_dist = -1;
17028 bool found_in_bb = false;
17030 FOR_EACH_EDGE (e, ei, bb->preds)
17032 int bb_dist
17033 = distance_non_agu_define_in_bb (regno1, regno2,
17034 insn, distance,
17035 BB_END (e->src),
17036 &found_in_bb);
17037 if (found_in_bb)
17039 if (shortest_dist < 0)
17040 shortest_dist = bb_dist;
17041 else if (bb_dist > 0)
17042 shortest_dist = MIN (bb_dist, shortest_dist);
17044 found = true;
17048 distance = shortest_dist;
17052 /* get_attr_type may modify recog data. We want to make sure
17053 that recog data is valid for instruction INSN, on which
17054 distance_non_agu_define is called. INSN is unchanged here. */
17055 extract_insn_cached (insn);
17057 if (!found)
17058 return -1;
17060 return distance >> 1;
17063 /* Return the distance in half-cycles between INSN and the next
17064    insn that uses register number REGNO in a memory address, added
17065    to DISTANCE.  Return -1 if REGNO is set.
17067    Put true into *FOUND if a register use was found and
17068    false otherwise.
17069    Put true into *REDEFINED if a register redefinition was
17070    found and false otherwise.  */
17072 static int
17073 distance_agu_use_in_bb (unsigned int regno,
17074 rtx insn, int distance, rtx start,
17075 bool *found, bool *redefined)
17077 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17078 rtx next = start;
17079 rtx prev = NULL;
17081 *found = false;
17082 *redefined = false;
17084 while (next
17085 && next != insn
17086 && distance < LEA_SEARCH_THRESHOLD)
17088 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17090 distance = increase_distance(prev, next, distance);
17091 if (insn_uses_reg_mem (regno, next))
17093 /* Return DISTANCE if OP0 is used in memory
17094 address in NEXT. */
17095 *found = true;
17096 return distance;
17099 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17101 /* Return -1 if OP0 is set in NEXT. */
17102 *redefined = true;
17103 return -1;
17106 prev = next;
17109 if (next == BB_END (bb))
17110 break;
17112 next = NEXT_INSN (next);
17115 return distance;
17118 /* Return the distance between INSN and the next insn that uses
17119    register number REGNO0 in a memory address.  Return -1 if no such
17120    use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set.  */
17122 static int
17123 distance_agu_use (unsigned int regno0, rtx insn)
17125 basic_block bb = BLOCK_FOR_INSN (insn);
17126 int distance = 0;
17127 bool found = false;
17128 bool redefined = false;
17130 if (insn != BB_END (bb))
17131 distance = distance_agu_use_in_bb (regno0, insn, distance,
17132 NEXT_INSN (insn),
17133 &found, &redefined);
17135 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17137 edge e;
17138 edge_iterator ei;
17139 bool simple_loop = false;
17141 FOR_EACH_EDGE (e, ei, bb->succs)
17142 if (e->dest == bb)
17144 simple_loop = true;
17145 break;
17148 if (simple_loop)
17149 distance = distance_agu_use_in_bb (regno0, insn,
17150 distance, BB_HEAD (bb),
17151 &found, &redefined);
17152 else
17154 int shortest_dist = -1;
17155 bool found_in_bb = false;
17156 bool redefined_in_bb = false;
17158 FOR_EACH_EDGE (e, ei, bb->succs)
17160 int bb_dist
17161 = distance_agu_use_in_bb (regno0, insn,
17162 distance, BB_HEAD (e->dest),
17163 &found_in_bb, &redefined_in_bb);
17164 if (found_in_bb)
17166 if (shortest_dist < 0)
17167 shortest_dist = bb_dist;
17168 else if (bb_dist > 0)
17169 shortest_dist = MIN (bb_dist, shortest_dist);
17171 found = true;
17175 distance = shortest_dist;
17179 if (!found || redefined)
17180 return -1;
17182 return distance >> 1;
17185 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17186    there is a dilemma of choosing LEA or ADD.
17187    Negative value: ADD is preferred over LEA
17188    Zero: Neutral
17189    Positive value: LEA is preferred over ADD.  */
17190 #define IX86_LEA_PRIORITY 0
17192 /* Return true if using the lea INSN has a performance advantage
17193    over the equivalent instruction sequence, which has SPLIT_COST
17194    cycles higher latency than the lea.  */
17196 static bool
17197 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17198 unsigned int regno2, int split_cost)
17200 int dist_define, dist_use;
17202 dist_define = distance_non_agu_define (regno1, regno2, insn);
17203 dist_use = distance_agu_use (regno0, insn);
17205 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17207   /* If there is no non-AGU operand definition, no AGU
17208      operand usage and the split cost is 0, then both the lea
17209      and non-lea variants have the same priority.  Currently
17210      we prefer lea for 64-bit code and non-lea for 32-bit
17211      code.  */
17212 if (dist_use < 0 && split_cost == 0)
17213 return TARGET_64BIT || IX86_LEA_PRIORITY;
17214 else
17215 return true;
17218   /* With a longer definition distance, lea is preferable.
17219      Here we adjust the distance to take into account the splitting
17220      cost and the lea priority.  */
17221 dist_define += split_cost + IX86_LEA_PRIORITY;
17223   /* If there is no use in a memory address then we just check
17224      that the split cost exceeds the AGU stall.  */
17225 if (dist_use < 0)
17226 return dist_define > LEA_MAX_STALL;
17228   /* If this insn has both a backward non-agu dependence and a forward
17229      agu dependence, the one with the shorter distance takes effect.  */
17230 return dist_define >= dist_use;
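/* As an illustration of the heuristic above (the numbers are hypothetical,
   not taken from any particular target): with LEA_MAX_STALL == 3 and
   split_cost == 0, a lea whose sources were defined 1 half-cycle earlier
   (dist_define == 1) and whose result feeds an address 2 half-cycles later
   (dist_use == 2) yields dist_define + IX86_LEA_PRIORITY == 1 < 2 == dist_use,
   so the function returns false and the split ALU sequence is preferred.  */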
17233 /* Return true if it is legal to clobber flags by INSN and
17234 false otherwise. */
17236 static bool
17237 ix86_ok_to_clobber_flags (rtx insn)
17239 basic_block bb = BLOCK_FOR_INSN (insn);
17240 df_ref *use;
17241 bitmap live;
17243 while (insn)
17245 if (NONDEBUG_INSN_P (insn))
17247 for (use = DF_INSN_USES (insn); *use; use++)
17248 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17249 return false;
17251 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17252 return true;
17255 if (insn == BB_END (bb))
17256 break;
17258 insn = NEXT_INSN (insn);
17261 live = df_get_live_out(bb);
17262 return !REGNO_REG_SET_P (live, FLAGS_REG);
17265 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17266 move and add to avoid AGU stalls. */
17268 bool
17269 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17271 unsigned int regno0, regno1, regno2;
17273 /* Check if we need to optimize. */
17274 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17275 return false;
17277 /* Check it is correct to split here. */
17278 if (!ix86_ok_to_clobber_flags(insn))
17279 return false;
17281 regno0 = true_regnum (operands[0]);
17282 regno1 = true_regnum (operands[1]);
17283 regno2 = true_regnum (operands[2]);
17285   /* We need to split only adds with a non-destructive
17286      destination operand.  */
17287 if (regno0 == regno1 || regno0 == regno2)
17288 return false;
17289 else
17290 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17293 /* Return true if we should emit a lea instruction instead of a mov
17294    instruction.  */
17296 bool
17297 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17299 unsigned int regno0, regno1;
17301 /* Check if we need to optimize. */
17302 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17303 return false;
17305 /* Use lea for reg to reg moves only. */
17306 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17307 return false;
17309 regno0 = true_regnum (operands[0]);
17310 regno1 = true_regnum (operands[1]);
17312 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17315 /* Return true if we need to split lea into a sequence of
17316 instructions to avoid AGU stalls. */
17318 bool
17319 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17321 unsigned int regno0, regno1, regno2;
17322 int split_cost;
17323 struct ix86_address parts;
17324 int ok;
17326   /* Check if we need to optimize.  */
17327 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17328 return false;
17330 /* Check it is correct to split here. */
17331 if (!ix86_ok_to_clobber_flags(insn))
17332 return false;
17334 ok = ix86_decompose_address (operands[1], &parts);
17335 gcc_assert (ok);
17337 /* There should be at least two components in the address. */
17338 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17339 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17340 return false;
17342   /* We should not split into add if a non-legitimate PIC
17343      operand is used as the displacement.  */
17344 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17345 return false;
17347   regno0 = true_regnum (operands[0]);
17348 regno1 = INVALID_REGNUM;
17349 regno2 = INVALID_REGNUM;
17351 if (parts.base)
17352 regno1 = true_regnum (parts.base);
17353 if (parts.index)
17354 regno2 = true_regnum (parts.index);
17356 split_cost = 0;
17358   /* Compute how many cycles we will add to the execution time
17359      if we split the lea into a sequence of instructions.  */
17360 if (parts.base || parts.index)
17362       /* Have to use a mov instruction if the non-destructive
17363          destination form is used.  */
17364 if (regno1 != regno0 && regno2 != regno0)
17365 split_cost += 1;
17367 /* Have to add index to base if both exist. */
17368 if (parts.base && parts.index)
17369 split_cost += 1;
17371 /* Have to use shift and adds if scale is 2 or greater. */
17372 if (parts.scale > 1)
17374 if (regno0 != regno1)
17375 split_cost += 1;
17376 else if (regno2 == regno0)
17377 split_cost += 4;
17378 else
17379 split_cost += parts.scale;
17382       /* Have to use an add instruction with an immediate if
17383          disp is nonzero.  */
17384 if (parts.disp && parts.disp != const0_rtx)
17385 split_cost += 1;
17387 /* Subtract the price of lea. */
17388 split_cost -= 1;
17391 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
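/* A hypothetical example of the cost computation above: for
   lea 0x4(%ebx,%ecx,4), %eax the split sequence needs a mov (the destination
   differs from both base and index), an add of the base, a shift for the
   scale and an add for the displacement; minus the lea itself this gives
   split_cost == 3, which ix86_lea_outperforms then weighs against the
   AGU stall distances.  */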
17394 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
17395    matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
17397 static void
17398 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17399 rtx dst, rtx src)
17401 rtx op, clob;
17403 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17404 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17406 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17409 /* Return true if the definition of REGNO1 is nearest to INSN.  */
17411 static bool
17412 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17414 rtx prev = insn;
17415 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17417 if (insn == start)
17418 return false;
17419 while (prev && prev != start)
17421 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17423 prev = PREV_INSN (prev);
17424 continue;
17426 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17427 return true;
17428 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17429 return false;
17430 prev = PREV_INSN (prev);
17433 /* None of the regs is defined in the bb. */
17434 return false;
17437 /* Split a lea instruction into a sequence of instructions
17438    which are executed on the ALU to avoid AGU stalls.
17439    It is assumed that clobbering the flags register is allowed
17440    at the lea position.  */
17442 void
17443 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17445 unsigned int regno0, regno1, regno2;
17446 struct ix86_address parts;
17447 rtx target, tmp;
17448 int ok, adds;
17450 ok = ix86_decompose_address (operands[1], &parts);
17451 gcc_assert (ok);
17453 target = gen_lowpart (mode, operands[0]);
17455 regno0 = true_regnum (target);
17456 regno1 = INVALID_REGNUM;
17457 regno2 = INVALID_REGNUM;
17459 if (parts.base)
17461 parts.base = gen_lowpart (mode, parts.base);
17462 regno1 = true_regnum (parts.base);
17465 if (parts.index)
17467 parts.index = gen_lowpart (mode, parts.index);
17468 regno2 = true_regnum (parts.index);
17471 if (parts.disp)
17472 parts.disp = gen_lowpart (mode, parts.disp);
17474 if (parts.scale > 1)
17476 /* Case r1 = r1 + ... */
17477 if (regno1 == regno0)
17479       /* If we have the case r1 = r1 + C * r1 then we
17480          would have to use multiplication, which is very
17481          expensive.  Assume the cost model is wrong if such
17482          a case reaches here.  */
17483 gcc_assert (regno2 != regno0);
17485 for (adds = parts.scale; adds > 0; adds--)
17486 ix86_emit_binop (PLUS, mode, target, parts.index);
17488 else
17490 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17491 if (regno0 != regno2)
17492 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17494 /* Use shift for scaling. */
17495 ix86_emit_binop (ASHIFT, mode, target,
17496 GEN_INT (exact_log2 (parts.scale)));
17498 if (parts.base)
17499 ix86_emit_binop (PLUS, mode, target, parts.base);
17501 if (parts.disp && parts.disp != const0_rtx)
17502 ix86_emit_binop (PLUS, mode, target, parts.disp);
17505 else if (!parts.base && !parts.index)
17507 gcc_assert(parts.disp);
17508 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17510 else
17512 if (!parts.base)
17514 if (regno0 != regno2)
17515 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17517 else if (!parts.index)
17519 if (regno0 != regno1)
17520 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17522 else
17524 if (regno0 == regno1)
17525 tmp = parts.index;
17526 else if (regno0 == regno2)
17527 tmp = parts.base;
17528 else
17530 rtx tmp1;
17532           /* Find the better operand for the SET instruction, depending
17533              on which definition is farther from the insn.  */
17534 if (find_nearest_reg_def (insn, regno1, regno2))
17535 tmp = parts.index, tmp1 = parts.base;
17536 else
17537 tmp = parts.base, tmp1 = parts.index;
17539 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17541 if (parts.disp && parts.disp != const0_rtx)
17542 ix86_emit_binop (PLUS, mode, target, parts.disp);
17544 ix86_emit_binop (PLUS, mode, target, tmp1);
17545 return;
17548 ix86_emit_binop (PLUS, mode, target, tmp);
17551 if (parts.disp && parts.disp != const0_rtx)
17552 ix86_emit_binop (PLUS, mode, target, parts.disp);
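/* For instance (an illustrative splitting, assuming distinct registers),
   lea 0x4(%ebx,%ecx,4), %eax is decomposed by the code above into

     movl  %ecx, %eax       copy the index
     sall  $2,   %eax       apply the scale by shifting
     addl  %ebx, %eax       add the base
     addl  $4,   %eax       add the displacement

   which uses only ALU resources and so avoids the AGU stall.  */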
17556 /* Return true if it is ok to optimize an ADD operation to a LEA
17557    operation to avoid flag register consumption.  For most processors,
17558    ADD is faster than LEA.  For processors like Atom, if the
17559    destination register of the LEA holds an actual address which will
17560    be used soon, LEA is better; otherwise ADD is better.  */
17562 bool
17563 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17565 unsigned int regno0 = true_regnum (operands[0]);
17566 unsigned int regno1 = true_regnum (operands[1]);
17567 unsigned int regno2 = true_regnum (operands[2]);
17569   /* If a = b + c, (a!=b && a!=c), the lea form must be used.  */
17570 if (regno0 != regno1 && regno0 != regno2)
17571 return true;
17573 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17574 return false;
17576 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17579 /* Return true if destination reg of SET_BODY is shift count of
17580 USE_BODY. */
17582 static bool
17583 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17585 rtx set_dest;
17586 rtx shift_rtx;
17587 int i;
17589 /* Retrieve destination of SET_BODY. */
17590 switch (GET_CODE (set_body))
17592 case SET:
17593 set_dest = SET_DEST (set_body);
17594 if (!set_dest || !REG_P (set_dest))
17595 return false;
17596 break;
17597 case PARALLEL:
17598 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17599 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17600 use_body))
17601 return true;
17602 default:
17603 return false;
17604 break;
17607 /* Retrieve shift count of USE_BODY. */
17608 switch (GET_CODE (use_body))
17610 case SET:
17611 shift_rtx = XEXP (use_body, 1);
17612 break;
17613 case PARALLEL:
17614 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17615 if (ix86_dep_by_shift_count_body (set_body,
17616 XVECEXP (use_body, 0, i)))
17617 return true;
17618 default:
17619 return false;
17620 break;
17623 if (shift_rtx
17624 && (GET_CODE (shift_rtx) == ASHIFT
17625 || GET_CODE (shift_rtx) == LSHIFTRT
17626 || GET_CODE (shift_rtx) == ASHIFTRT
17627 || GET_CODE (shift_rtx) == ROTATE
17628 || GET_CODE (shift_rtx) == ROTATERT))
17630 rtx shift_count = XEXP (shift_rtx, 1);
17632 /* Return true if shift count is dest of SET_BODY. */
17633 if (REG_P (shift_count))
17635       /* Add this check since it can be invoked before register
17636          allocation by the pre-reload scheduler.  */
17637 if (reload_completed
17638 && true_regnum (set_dest) == true_regnum (shift_count))
17639 return true;
17640 else if (REGNO(set_dest) == REGNO(shift_count))
17641 return true;
17645 return false;
17648 /* Return true if destination reg of SET_INSN is shift count of
17649 USE_INSN. */
17651 bool
17652 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17654 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17655 PATTERN (use_insn));
17658 /* Return TRUE or FALSE depending on whether the unary operator meets the
17659 appropriate constraints. */
17661 bool
17662 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17663 enum machine_mode mode ATTRIBUTE_UNUSED,
17664 rtx operands[2] ATTRIBUTE_UNUSED)
17666 /* If one of operands is memory, source and destination must match. */
17667 if ((MEM_P (operands[0])
17668 || MEM_P (operands[1]))
17669 && ! rtx_equal_p (operands[0], operands[1]))
17670 return false;
17671 return true;
17674 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17675 are ok, keeping in mind the possible movddup alternative. */
17677 bool
17678 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17680 if (MEM_P (operands[0]))
17681 return rtx_equal_p (operands[0], operands[1 + high]);
17682 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17683 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17684 return true;
17687 /* Post-reload splitter for converting an SF or DFmode value in an
17688 SSE register into an unsigned SImode. */
17690 void
17691 ix86_split_convert_uns_si_sse (rtx operands[])
17693 enum machine_mode vecmode;
17694 rtx value, large, zero_or_two31, input, two31, x;
17696 large = operands[1];
17697 zero_or_two31 = operands[2];
17698 input = operands[3];
17699 two31 = operands[4];
17700 vecmode = GET_MODE (large);
17701 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17703 /* Load up the value into the low element. We must ensure that the other
17704 elements are valid floats -- zero is the easiest such value. */
17705 if (MEM_P (input))
17707 if (vecmode == V4SFmode)
17708 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17709 else
17710 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17712 else
17714 input = gen_rtx_REG (vecmode, REGNO (input));
17715 emit_move_insn (value, CONST0_RTX (vecmode));
17716 if (vecmode == V4SFmode)
17717 emit_insn (gen_sse_movss (value, value, input));
17718 else
17719 emit_insn (gen_sse2_movsd (value, value, input));
17722 emit_move_insn (large, two31);
17723 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17725 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17726 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17728 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17729 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17731 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17732 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17734 large = gen_rtx_REG (V4SImode, REGNO (large));
17735 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17737 x = gen_rtx_REG (V4SImode, REGNO (value));
17738 if (vecmode == V4SFmode)
17739 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17740 else
17741 emit_insn (gen_sse2_cvttpd2dq (x, value));
17742 value = x;
17744 emit_insn (gen_xorv4si3 (value, value, large));
17747 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17748 Expects the 64-bit DImode to be supplied in a pair of integral
17749 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17750 -mfpmath=sse, !optimize_size only. */
17752 void
17753 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17755 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17756 rtx int_xmm, fp_xmm;
17757 rtx biases, exponents;
17758 rtx x;
17760 int_xmm = gen_reg_rtx (V4SImode);
17761 if (TARGET_INTER_UNIT_MOVES)
17762 emit_insn (gen_movdi_to_sse (int_xmm, input));
17763 else if (TARGET_SSE_SPLIT_REGS)
17765 emit_clobber (int_xmm);
17766 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17768 else
17770 x = gen_reg_rtx (V2DImode);
17771 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17772 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17775 x = gen_rtx_CONST_VECTOR (V4SImode,
17776 gen_rtvec (4, GEN_INT (0x43300000UL),
17777 GEN_INT (0x45300000UL),
17778 const0_rtx, const0_rtx));
17779 exponents = validize_mem (force_const_mem (V4SImode, x));
17781 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17782 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17784 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17785 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17786 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17787 (0x1.0p84 + double(fp_value_hi_xmm)).
17788 Note these exponents differ by 32. */
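/* A worked example of the trick (values chosen purely for illustration):
   for the input 0x0000000200000003 the low double is built from the bits
   0x43300000_00000003, i.e. 0x1.0p52 + 3, and the high double from
   0x45300000_00000002, i.e. 0x1.0p84 + 2 * 0x1.0p32.  Subtracting the
   biases below leaves 3 and 2 * 2^32, whose sum 8589934595 is the
   original unsigned value.  */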
17790 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17792 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17793 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17794 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17795 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17796 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17797 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17798 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17799 biases = validize_mem (force_const_mem (V2DFmode, biases));
17800 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17802 /* Add the upper and lower DFmode values together. */
17803 if (TARGET_SSE3)
17804 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17805 else
17807 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17808 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17809 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17812 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17815 /* Not used, but eases macroization of patterns. */
17816 void
17817 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17818 rtx input ATTRIBUTE_UNUSED)
17820 gcc_unreachable ();
17823 /* Convert an unsigned SImode value into a DFmode. Only currently used
17824 for SSE, but applicable anywhere. */
17826 void
17827 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17829 REAL_VALUE_TYPE TWO31r;
17830 rtx x, fp;
17832 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17833 NULL, 1, OPTAB_DIRECT);
17835 fp = gen_reg_rtx (DFmode);
17836 emit_insn (gen_floatsidf2 (fp, x));
17838 real_ldexp (&TWO31r, &dconst1, 31);
17839 x = const_double_from_real_value (TWO31r, DFmode);
17841 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17842 if (x != target)
17843 emit_move_insn (target, x);
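/* Illustrative arithmetic for the sequence above: for input 0xffffffff the
   SImode addition of -2^31 wraps to 0x7fffffff, which converts exactly to
   2147483647.0; adding TWO31r (2^31) back yields 4294967295.0, the unsigned
   value of the input.  */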
17846 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17847 32-bit mode; otherwise we have a direct convert instruction. */
17849 void
17850 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17852 REAL_VALUE_TYPE TWO32r;
17853 rtx fp_lo, fp_hi, x;
17855 fp_lo = gen_reg_rtx (DFmode);
17856 fp_hi = gen_reg_rtx (DFmode);
17858 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17860 real_ldexp (&TWO32r, &dconst1, 32);
17861 x = const_double_from_real_value (TWO32r, DFmode);
17862 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17864 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17866 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17867 0, OPTAB_DIRECT);
17868 if (x != target)
17869 emit_move_insn (target, x);
17872 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17873 For x86_32, -mfpmath=sse, !optimize_size only. */
17874 void
17875 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17877 REAL_VALUE_TYPE ONE16r;
17878 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17880 real_ldexp (&ONE16r, &dconst1, 16);
17881 x = const_double_from_real_value (ONE16r, SFmode);
17882 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17883 NULL, 0, OPTAB_DIRECT);
17884 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17885 NULL, 0, OPTAB_DIRECT);
17886 fp_hi = gen_reg_rtx (SFmode);
17887 fp_lo = gen_reg_rtx (SFmode);
17888 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17889 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17890 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17891 0, OPTAB_DIRECT);
17892 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17893 0, OPTAB_DIRECT);
17894 if (!rtx_equal_p (target, fp_hi))
17895 emit_move_insn (target, fp_hi);
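/* Illustrative arithmetic for the split above: for input 0x12345678 the two
   halves are 0x1234 (4660) and 0x5678 (22136); the result is
   4660 * 65536 + 22136 == 305419896 == 0x12345678, subject to the usual
   SFmode rounding of the final operations.  */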
17898 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17899    a vector of unsigned ints VAL to a vector of floats TARGET.  */
17901 void
17902 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17904 rtx tmp[8];
17905 REAL_VALUE_TYPE TWO16r;
17906 enum machine_mode intmode = GET_MODE (val);
17907 enum machine_mode fltmode = GET_MODE (target);
17908 rtx (*cvt) (rtx, rtx);
17910 if (intmode == V4SImode)
17911 cvt = gen_floatv4siv4sf2;
17912 else
17913 cvt = gen_floatv8siv8sf2;
17914 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17915 tmp[0] = force_reg (intmode, tmp[0]);
17916 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17917 OPTAB_DIRECT);
17918 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17919 NULL_RTX, 1, OPTAB_DIRECT);
17920 tmp[3] = gen_reg_rtx (fltmode);
17921 emit_insn (cvt (tmp[3], tmp[1]));
17922 tmp[4] = gen_reg_rtx (fltmode);
17923 emit_insn (cvt (tmp[4], tmp[2]));
17924 real_ldexp (&TWO16r, &dconst1, 16);
17925 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17926 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17927 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17928 OPTAB_DIRECT);
17929 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17930 OPTAB_DIRECT);
17931 if (tmp[7] != target)
17932 emit_move_insn (target, tmp[7]);
17935 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17936 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17937 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17938 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17941 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17943 REAL_VALUE_TYPE TWO31r;
17944 rtx two31r, tmp[4];
17945 enum machine_mode mode = GET_MODE (val);
17946 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17947 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17948 rtx (*cmp) (rtx, rtx, rtx, rtx);
17949 int i;
17951 for (i = 0; i < 3; i++)
17952 tmp[i] = gen_reg_rtx (mode);
17953 real_ldexp (&TWO31r, &dconst1, 31);
17954 two31r = const_double_from_real_value (TWO31r, scalarmode);
17955 two31r = ix86_build_const_vector (mode, 1, two31r);
17956 two31r = force_reg (mode, two31r);
17957 switch (mode)
17959 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17960 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17961 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17962 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17963 default: gcc_unreachable ();
17965 tmp[3] = gen_rtx_LE (mode, two31r, val);
17966 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17967 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17968 0, OPTAB_DIRECT);
17969 if (intmode == V4SImode || TARGET_AVX2)
17970 *xorp = expand_simple_binop (intmode, ASHIFT,
17971 gen_lowpart (intmode, tmp[0]),
17972 GEN_INT (31), NULL_RTX, 0,
17973 OPTAB_DIRECT);
17974 else
17976 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17977 two31 = ix86_build_const_vector (intmode, 1, two31);
17978 *xorp = expand_simple_binop (intmode, AND,
17979 gen_lowpart (intmode, tmp[0]),
17980 two31, NULL_RTX, 0,
17981 OPTAB_DIRECT);
17983 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17984 0, OPTAB_DIRECT);
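/* A numerical illustration of the adjustment (example value only): for a
   lane holding 3e9, which is >= 0x1p31, the comparison mask selects 0x1p31,
   the subtraction leaves 852516352.0 and the signed truncation gives
   852516352; xoring in 0x80000000 from *XORP afterwards restores
   3000000000.  Lanes below 0x1p31 are left unchanged and get a zero lane
   in *XORP.  */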
17987 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17988 then replicate the value for all elements of the vector
17989 register. */
17992 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17994 int i, n_elt;
17995 rtvec v;
17996 enum machine_mode scalar_mode;
17998 switch (mode)
18000 case V32QImode:
18001 case V16QImode:
18002 case V16HImode:
18003 case V8HImode:
18004 case V8SImode:
18005 case V4SImode:
18006 case V4DImode:
18007 case V2DImode:
18008 gcc_assert (vect);
18009 case V8SFmode:
18010 case V4SFmode:
18011 case V4DFmode:
18012 case V2DFmode:
18013 n_elt = GET_MODE_NUNITS (mode);
18014 v = rtvec_alloc (n_elt);
18015 scalar_mode = GET_MODE_INNER (mode);
18017 RTVEC_ELT (v, 0) = value;
18019 for (i = 1; i < n_elt; ++i)
18020 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18022 return gen_rtx_CONST_VECTOR (mode, v);
18024 default:
18025 gcc_unreachable ();
18029 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18030 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18031 for an SSE register. If VECT is true, then replicate the mask for
18032 all elements of the vector register. If INVERT is true, then create
18033 a mask excluding the sign bit. */
18036 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18038 enum machine_mode vec_mode, imode;
18039 HOST_WIDE_INT hi, lo;
18040 int shift = 63;
18041 rtx v;
18042 rtx mask;
18044 /* Find the sign bit, sign extended to 2*HWI. */
18045 switch (mode)
18047 case V8SImode:
18048 case V4SImode:
18049 case V8SFmode:
18050 case V4SFmode:
18051 vec_mode = mode;
18052 mode = GET_MODE_INNER (mode);
18053 imode = SImode;
18054 lo = 0x80000000, hi = lo < 0;
18055 break;
18057 case V4DImode:
18058 case V2DImode:
18059 case V4DFmode:
18060 case V2DFmode:
18061 vec_mode = mode;
18062 mode = GET_MODE_INNER (mode);
18063 imode = DImode;
18064 if (HOST_BITS_PER_WIDE_INT >= 64)
18065 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18066 else
18067 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18068 break;
18070 case TImode:
18071 case TFmode:
18072 vec_mode = VOIDmode;
18073 if (HOST_BITS_PER_WIDE_INT >= 64)
18075 imode = TImode;
18076 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18078 else
18080 rtvec vec;
18082 imode = DImode;
18083 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18085 if (invert)
18087 lo = ~lo, hi = ~hi;
18088 v = constm1_rtx;
18090 else
18091 v = const0_rtx;
18093 mask = immed_double_const (lo, hi, imode);
18095 vec = gen_rtvec (2, v, mask);
18096 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18097 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18099 return v;
18101 break;
18103 default:
18104 gcc_unreachable ();
18107 if (invert)
18108 lo = ~lo, hi = ~hi;
18110 /* Force this value into the low part of a fp vector constant. */
18111 mask = immed_double_const (lo, hi, imode);
18112 mask = gen_lowpart (mode, mask);
18114 if (vec_mode == VOIDmode)
18115 return force_reg (mode, mask);
18117 v = ix86_build_const_vector (vec_mode, vect, mask);
18118 return force_reg (vec_mode, v);
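/* For reference, the masks produced above are e.g. 0x80000000 per SFmode
   element and 0x8000000000000000 per DFmode element, or their complements
   0x7fffffff / 0x7fffffffffffffff when INVERT is true.  */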
18121 /* Generate code for floating point ABS or NEG. */
18123 void
18124 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18125 rtx operands[])
18127 rtx mask, set, dst, src;
18128 bool use_sse = false;
18129 bool vector_mode = VECTOR_MODE_P (mode);
18130 enum machine_mode vmode = mode;
18132 if (vector_mode)
18133 use_sse = true;
18134 else if (mode == TFmode)
18135 use_sse = true;
18136 else if (TARGET_SSE_MATH)
18138 use_sse = SSE_FLOAT_MODE_P (mode);
18139 if (mode == SFmode)
18140 vmode = V4SFmode;
18141 else if (mode == DFmode)
18142 vmode = V2DFmode;
18145 /* NEG and ABS performed with SSE use bitwise mask operations.
18146 Create the appropriate mask now. */
18147 if (use_sse)
18148 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18149 else
18150 mask = NULL_RTX;
18152 dst = operands[0];
18153 src = operands[1];
18155 set = gen_rtx_fmt_e (code, mode, src);
18156 set = gen_rtx_SET (VOIDmode, dst, set);
18158 if (mask)
18160 rtx use, clob;
18161 rtvec par;
18163 use = gen_rtx_USE (VOIDmode, mask);
18164 if (vector_mode)
18165 par = gen_rtvec (2, set, use);
18166 else
18168 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18169 par = gen_rtvec (3, set, use, clob);
18171 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18173 else
18174 emit_insn (set);
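/* The patterns emitted above implement, in effect,
     neg: dst = src ^ signbit_mask
     abs: dst = src & ~signbit_mask
   (the mask is built with invert == (code == ABS)); this is only a summary
   of the semantics, the actual bitwise operation is selected by the insn
   patterns that consume the USE of the mask.  */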
18177 /* Expand a copysign operation. Special case operand 0 being a constant. */
18179 void
18180 ix86_expand_copysign (rtx operands[])
18182 enum machine_mode mode, vmode;
18183 rtx dest, op0, op1, mask, nmask;
18185 dest = operands[0];
18186 op0 = operands[1];
18187 op1 = operands[2];
18189 mode = GET_MODE (dest);
18191 if (mode == SFmode)
18192 vmode = V4SFmode;
18193 else if (mode == DFmode)
18194 vmode = V2DFmode;
18195 else
18196 vmode = mode;
18198 if (GET_CODE (op0) == CONST_DOUBLE)
18200 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18202 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18203 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18205 if (mode == SFmode || mode == DFmode)
18207 if (op0 == CONST0_RTX (mode))
18208 op0 = CONST0_RTX (vmode);
18209 else
18211 rtx v = ix86_build_const_vector (vmode, false, op0);
18213 op0 = force_reg (vmode, v);
18216 else if (op0 != CONST0_RTX (mode))
18217 op0 = force_reg (mode, op0);
18219 mask = ix86_build_signbit_mask (vmode, 0, 0);
18221 if (mode == SFmode)
18222 copysign_insn = gen_copysignsf3_const;
18223 else if (mode == DFmode)
18224 copysign_insn = gen_copysigndf3_const;
18225 else
18226 copysign_insn = gen_copysigntf3_const;
18228 emit_insn (copysign_insn (dest, op0, op1, mask));
18230 else
18232 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18234 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18235 mask = ix86_build_signbit_mask (vmode, 0, 0);
18237 if (mode == SFmode)
18238 copysign_insn = gen_copysignsf3_var;
18239 else if (mode == DFmode)
18240 copysign_insn = gen_copysigndf3_var;
18241 else
18242 copysign_insn = gen_copysigntf3_var;
18244 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
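/* Both branches above ultimately compute the textbook identity
   copysign (x, y) == (x & ~signbit_mask) | (y & signbit_mask); the _const
   and _var split variants below differ mainly in whether the magnitude
   operand is already a constant vector.  */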
18248 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18249 be a constant, and so has already been expanded into a vector constant. */
18251 void
18252 ix86_split_copysign_const (rtx operands[])
18254 enum machine_mode mode, vmode;
18255 rtx dest, op0, mask, x;
18257 dest = operands[0];
18258 op0 = operands[1];
18259 mask = operands[3];
18261 mode = GET_MODE (dest);
18262 vmode = GET_MODE (mask);
18264 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18265 x = gen_rtx_AND (vmode, dest, mask);
18266 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18268 if (op0 != CONST0_RTX (vmode))
18270 x = gen_rtx_IOR (vmode, dest, op0);
18271 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18275 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18276 so we have to do two masks. */
18278 void
18279 ix86_split_copysign_var (rtx operands[])
18281 enum machine_mode mode, vmode;
18282 rtx dest, scratch, op0, op1, mask, nmask, x;
18284 dest = operands[0];
18285 scratch = operands[1];
18286 op0 = operands[2];
18287 op1 = operands[3];
18288 nmask = operands[4];
18289 mask = operands[5];
18291 mode = GET_MODE (dest);
18292 vmode = GET_MODE (mask);
18294 if (rtx_equal_p (op0, op1))
18296 /* Shouldn't happen often (it's useless, obviously), but when it does
18297 we'd generate incorrect code if we continue below. */
18298 emit_move_insn (dest, op0);
18299 return;
18302 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18304 gcc_assert (REGNO (op1) == REGNO (scratch));
18306 x = gen_rtx_AND (vmode, scratch, mask);
18307 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18309 dest = mask;
18310 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18311 x = gen_rtx_NOT (vmode, dest);
18312 x = gen_rtx_AND (vmode, x, op0);
18313 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18315 else
18317 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18319 x = gen_rtx_AND (vmode, scratch, mask);
18321 else /* alternative 2,4 */
18323 gcc_assert (REGNO (mask) == REGNO (scratch));
18324 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18325 x = gen_rtx_AND (vmode, scratch, op1);
18327 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18329 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18331 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18332 x = gen_rtx_AND (vmode, dest, nmask);
18334 else /* alternative 3,4 */
18336 gcc_assert (REGNO (nmask) == REGNO (dest));
18337 dest = nmask;
18338 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18339 x = gen_rtx_AND (vmode, dest, op0);
18341 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18344 x = gen_rtx_IOR (vmode, dest, scratch);
18345 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18348 /* Return TRUE or FALSE depending on whether the first SET in INSN
18349    has source and destination with matching CC modes and whether the
18350    CC mode is at least as constrained as REQ_MODE.  */
18352 bool
18353 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18355 rtx set;
18356 enum machine_mode set_mode;
18358 set = PATTERN (insn);
18359 if (GET_CODE (set) == PARALLEL)
18360 set = XVECEXP (set, 0, 0);
18361 gcc_assert (GET_CODE (set) == SET);
18362 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18364 set_mode = GET_MODE (SET_DEST (set));
18365 switch (set_mode)
18367 case CCNOmode:
18368 if (req_mode != CCNOmode
18369 && (req_mode != CCmode
18370 || XEXP (SET_SRC (set), 1) != const0_rtx))
18371 return false;
18372 break;
18373 case CCmode:
18374 if (req_mode == CCGCmode)
18375 return false;
18376 /* FALLTHRU */
18377 case CCGCmode:
18378 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18379 return false;
18380 /* FALLTHRU */
18381 case CCGOCmode:
18382 if (req_mode == CCZmode)
18383 return false;
18384 /* FALLTHRU */
18385 case CCZmode:
18386 break;
18388 case CCAmode:
18389 case CCCmode:
18390 case CCOmode:
18391 case CCSmode:
18392 if (set_mode != req_mode)
18393 return false;
18394 break;
18396 default:
18397 gcc_unreachable ();
18400 return GET_MODE (SET_SRC (set)) == set_mode;
18403 /* Generate insn patterns to do an integer compare of OPERANDS. */
18405 static rtx
18406 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18408 enum machine_mode cmpmode;
18409 rtx tmp, flags;
18411 cmpmode = SELECT_CC_MODE (code, op0, op1);
18412 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18414 /* This is very simple, but making the interface the same as in the
18415 FP case makes the rest of the code easier. */
18416 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18417 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18419 /* Return the test that should be put into the flags user, i.e.
18420 the bcc, scc, or cmov instruction. */
18421 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18424 /* Figure out whether to use ordered or unordered fp comparisons.
18425 Return the appropriate mode to use. */
18427 enum machine_mode
18428 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18430 /* ??? In order to make all comparisons reversible, we do all comparisons
18431 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18432    all forms of trapping and non-trapping comparisons, we can make inequality
18433 comparisons trapping again, since it results in better code when using
18434 FCOM based compares. */
18435 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18438 enum machine_mode
18439 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18441 enum machine_mode mode = GET_MODE (op0);
18443 if (SCALAR_FLOAT_MODE_P (mode))
18445 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18446 return ix86_fp_compare_mode (code);
18449 switch (code)
18451 /* Only zero flag is needed. */
18452 case EQ: /* ZF=0 */
18453 case NE: /* ZF!=0 */
18454 return CCZmode;
18455 /* Codes needing carry flag. */
18456 case GEU: /* CF=0 */
18457 case LTU: /* CF=1 */
18458 /* Detect overflow checks. They need just the carry flag. */
18459 if (GET_CODE (op0) == PLUS
18460 && rtx_equal_p (op1, XEXP (op0, 0)))
18461 return CCCmode;
18462 else
18463 return CCmode;
18464 case GTU: /* CF=0 & ZF=0 */
18465 case LEU: /* CF=1 | ZF=1 */
18466 /* Detect overflow checks. They need just the carry flag. */
18467 if (GET_CODE (op0) == MINUS
18468 && rtx_equal_p (op1, XEXP (op0, 0)))
18469 return CCCmode;
18470 else
18471 return CCmode;
18472 /* Codes possibly doable only with sign flag when
18473 comparing against zero. */
18474 case GE: /* SF=OF or SF=0 */
18475 case LT: /* SF<>OF or SF=1 */
18476 if (op1 == const0_rtx)
18477 return CCGOCmode;
18478 else
18479 /* For other cases Carry flag is not required. */
18480 return CCGCmode;
18481       /* Codes doable only with the sign flag when comparing
18482          against zero, but we miss the jump instruction for it,
18483          so we need to use relational tests against overflow,
18484          which thus needs to be zero.  */
18485 case GT: /* ZF=0 & SF=OF */
18486 case LE: /* ZF=1 | SF<>OF */
18487 if (op1 == const0_rtx)
18488 return CCNOmode;
18489 else
18490 return CCGCmode;
18491       /* The strcmp pattern does (use flags) and combine may ask us for the
18492          proper mode.  */
18493 case USE:
18494 return CCmode;
18495 default:
18496 gcc_unreachable ();
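/* As an example of the CCCmode cases above: the C overflow idiom
   "if (a + b < a)" is seen by combine as an LTU comparison of (plus a b)
   against a, so only the carry flag is needed and CCCmode is chosen.  */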
18500 /* Return the fixed registers used for condition codes. */
18502 static bool
18503 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18505 *p1 = FLAGS_REG;
18506 *p2 = FPSR_REG;
18507 return true;
18510 /* If two condition code modes are compatible, return a condition code
18511 mode which is compatible with both. Otherwise, return
18512 VOIDmode. */
18514 static enum machine_mode
18515 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18517 if (m1 == m2)
18518 return m1;
18520 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18521 return VOIDmode;
18523 if ((m1 == CCGCmode && m2 == CCGOCmode)
18524 || (m1 == CCGOCmode && m2 == CCGCmode))
18525 return CCGCmode;
18527 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18528 return m2;
18529 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18530 return m1;
18532 switch (m1)
18534 default:
18535 gcc_unreachable ();
18537 case CCmode:
18538 case CCGCmode:
18539 case CCGOCmode:
18540 case CCNOmode:
18541 case CCAmode:
18542 case CCCmode:
18543 case CCOmode:
18544 case CCSmode:
18545 case CCZmode:
18546 switch (m2)
18548 default:
18549 return VOIDmode;
18551 case CCmode:
18552 case CCGCmode:
18553 case CCGOCmode:
18554 case CCNOmode:
18555 case CCAmode:
18556 case CCCmode:
18557 case CCOmode:
18558 case CCSmode:
18559 case CCZmode:
18560 return CCmode;
18563 case CCFPmode:
18564 case CCFPUmode:
18565 /* These are only compatible with themselves, which we already
18566 checked above. */
18567 return VOIDmode;
18572 /* Return a comparison we can do and that it is equivalent to
18573 swap_condition (code) apart possibly from orderedness.
18574 But, never change orderedness if TARGET_IEEE_FP, returning
18575 UNKNOWN in that case if necessary. */
18577 static enum rtx_code
18578 ix86_fp_swap_condition (enum rtx_code code)
18580 switch (code)
18582 case GT: /* GTU - CF=0 & ZF=0 */
18583 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18584 case GE: /* GEU - CF=0 */
18585 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18586 case UNLT: /* LTU - CF=1 */
18587 return TARGET_IEEE_FP ? UNKNOWN : GT;
18588 case UNLE: /* LEU - CF=1 | ZF=1 */
18589 return TARGET_IEEE_FP ? UNKNOWN : GE;
18590 default:
18591 return swap_condition (code);
18595 /* Return the cost of comparison CODE using the best strategy for performance.
18596    All the following functions use the number of instructions as the cost metric.
18597    In the future this should be tweaked to compute bytes for optimize_size and
18598    take into account the performance of various instructions on various CPUs.  */
18600 static int
18601 ix86_fp_comparison_cost (enum rtx_code code)
18603 int arith_cost;
18605 /* The cost of code using bit-twiddling on %ah. */
18606 switch (code)
18608 case UNLE:
18609 case UNLT:
18610 case LTGT:
18611 case GT:
18612 case GE:
18613 case UNORDERED:
18614 case ORDERED:
18615 case UNEQ:
18616 arith_cost = 4;
18617 break;
18618 case LT:
18619 case NE:
18620 case EQ:
18621 case UNGE:
18622 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18623 break;
18624 case LE:
18625 case UNGT:
18626 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18627 break;
18628 default:
18629 gcc_unreachable ();
18632 switch (ix86_fp_comparison_strategy (code))
18634 case IX86_FPCMP_COMI:
18635 return arith_cost > 4 ? 3 : 2;
18636 case IX86_FPCMP_SAHF:
18637 return arith_cost > 4 ? 4 : 3;
18638 default:
18639 return arith_cost;
18643 /* Return the strategy to use for floating-point.  We assume that fcomi is
18644    always preferable where available, since that is also true when looking at
18645    size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
18647 enum ix86_fpcmp_strategy
18648 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18650 /* Do fcomi/sahf based test when profitable. */
18652 if (TARGET_CMOVE)
18653 return IX86_FPCMP_COMI;
18655 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18656 return IX86_FPCMP_SAHF;
18658 return IX86_FPCMP_ARITH;
18661 /* Swap, force into registers, or otherwise massage the two operands
18662 to a fp comparison. The operands are updated in place; the new
18663 comparison code is returned. */
18665 static enum rtx_code
18666 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18668 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18669 rtx op0 = *pop0, op1 = *pop1;
18670 enum machine_mode op_mode = GET_MODE (op0);
18671 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18673 /* All of the unordered compare instructions only work on registers.
18674 The same is true of the fcomi compare instructions. The XFmode
18675 compare instructions require registers except when comparing
18676 against zero or when converting operand 1 from fixed point to
18677 floating point. */
18679 if (!is_sse
18680 && (fpcmp_mode == CCFPUmode
18681 || (op_mode == XFmode
18682 && ! (standard_80387_constant_p (op0) == 1
18683 || standard_80387_constant_p (op1) == 1)
18684 && GET_CODE (op1) != FLOAT)
18685 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18687 op0 = force_reg (op_mode, op0);
18688 op1 = force_reg (op_mode, op1);
18690 else
18692 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18693 things around if they appear profitable, otherwise force op0
18694 into a register. */
18696 if (standard_80387_constant_p (op0) == 0
18697 || (MEM_P (op0)
18698 && ! (standard_80387_constant_p (op1) == 0
18699 || MEM_P (op1))))
18701 enum rtx_code new_code = ix86_fp_swap_condition (code);
18702 if (new_code != UNKNOWN)
18704 rtx tmp;
18705 tmp = op0, op0 = op1, op1 = tmp;
18706 code = new_code;
18710 if (!REG_P (op0))
18711 op0 = force_reg (op_mode, op0);
18713 if (CONSTANT_P (op1))
18715 int tmp = standard_80387_constant_p (op1);
18716 if (tmp == 0)
18717 op1 = validize_mem (force_const_mem (op_mode, op1));
18718 else if (tmp == 1)
18720 if (TARGET_CMOVE)
18721 op1 = force_reg (op_mode, op1);
18723 else
18724 op1 = force_reg (op_mode, op1);
18728 /* Try to rearrange the comparison to make it cheaper. */
18729 if (ix86_fp_comparison_cost (code)
18730 > ix86_fp_comparison_cost (swap_condition (code))
18731 && (REG_P (op1) || can_create_pseudo_p ()))
18733 rtx tmp;
18734 tmp = op0, op0 = op1, op1 = tmp;
18735 code = swap_condition (code);
18736 if (!REG_P (op0))
18737 op0 = force_reg (op_mode, op0);
18740 *pop0 = op0;
18741 *pop1 = op1;
18742 return code;
18745 /* Convert comparison codes we use to represent FP comparison to integer
18746 code that will result in proper branch. Return UNKNOWN if no such code
18747 is available. */
18749 enum rtx_code
18750 ix86_fp_compare_code_to_integer (enum rtx_code code)
18752 switch (code)
18754 case GT:
18755 return GTU;
18756 case GE:
18757 return GEU;
18758 case ORDERED:
18759 case UNORDERED:
18760 return code;
18761 break;
18762 case UNEQ:
18763 return EQ;
18764 break;
18765 case UNLT:
18766 return LTU;
18767 break;
18768 case UNLE:
18769 return LEU;
18770 break;
18771 case LTGT:
18772 return NE;
18773 break;
18774 default:
18775 return UNKNOWN;
18779 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18781 static rtx
18782 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18784 enum machine_mode fpcmp_mode, intcmp_mode;
18785 rtx tmp, tmp2;
18787 fpcmp_mode = ix86_fp_compare_mode (code);
18788 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18790 /* Do fcomi/sahf based test when profitable. */
18791 switch (ix86_fp_comparison_strategy (code))
18793 case IX86_FPCMP_COMI:
18794 intcmp_mode = fpcmp_mode;
18795 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18796 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18797 tmp);
18798 emit_insn (tmp);
18799 break;
18801 case IX86_FPCMP_SAHF:
18802 intcmp_mode = fpcmp_mode;
18803 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18804 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18805 tmp);
18807 if (!scratch)
18808 scratch = gen_reg_rtx (HImode);
18809 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18810 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18811 break;
18813 case IX86_FPCMP_ARITH:
18814 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18815 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18816 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18817 if (!scratch)
18818 scratch = gen_reg_rtx (HImode);
18819 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18821       /* In the unordered case, we have to check C2 for NaNs, which
18822          doesn't happen to work out to anything nice combination-wise.
18823 So do some bit twiddling on the value we've got in AH to come
18824 up with an appropriate set of condition codes. */
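/* For reference: after fnstsw the relevant FPU condition bits land in AH as
   C0 == 0x01, C2 == 0x04 and C3 == 0x40, so the 0x45 masks below test
   C0|C2|C3 and 0x40 tests C3 alone; the constants are not arbitrary.  */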
18826 intcmp_mode = CCNOmode;
18827 switch (code)
18829 case GT:
18830 case UNGT:
18831 if (code == GT || !TARGET_IEEE_FP)
18833 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18834 code = EQ;
18836 else
18838 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18839 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18840 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18841 intcmp_mode = CCmode;
18842 code = GEU;
18844 break;
18845 case LT:
18846 case UNLT:
18847 if (code == LT && TARGET_IEEE_FP)
18849 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18850 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18851 intcmp_mode = CCmode;
18852 code = EQ;
18854 else
18856 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18857 code = NE;
18859 break;
18860 case GE:
18861 case UNGE:
18862 if (code == GE || !TARGET_IEEE_FP)
18864 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18865 code = EQ;
18867 else
18869 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18870 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18871 code = NE;
18873 break;
18874 case LE:
18875 case UNLE:
18876 if (code == LE && TARGET_IEEE_FP)
18878 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18879 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18880 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18881 intcmp_mode = CCmode;
18882 code = LTU;
18884 else
18886 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18887 code = NE;
18889 break;
18890 case EQ:
18891 case UNEQ:
18892 if (code == EQ && TARGET_IEEE_FP)
18894 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18895 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18896 intcmp_mode = CCmode;
18897 code = EQ;
18899 else
18901 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18902 code = NE;
18904 break;
18905 case NE:
18906 case LTGT:
18907 if (code == NE && TARGET_IEEE_FP)
18909 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18910 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18911 GEN_INT (0x40)));
18912 code = NE;
18914 else
18916 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18917 code = EQ;
18919 break;
18921 case UNORDERED:
18922 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18923 code = NE;
18924 break;
18925 case ORDERED:
18926 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18927 code = EQ;
18928 break;
18930 default:
18931 gcc_unreachable ();
18933 break;
18935 default:
18936 gcc_unreachable();
18939 /* Return the test that should be put into the flags user, i.e.
18940 the bcc, scc, or cmov instruction. */
18941 return gen_rtx_fmt_ee (code, VOIDmode,
18942 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18943 const0_rtx);
18946 static rtx
18947 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18949 rtx ret;
18951 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18952 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18954 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18956 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18957 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18959 else
18960 ret = ix86_expand_int_compare (code, op0, op1);
18962 return ret;
18965 void
18966 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18968 enum machine_mode mode = GET_MODE (op0);
18969 rtx tmp;
18971 switch (mode)
18973 case SFmode:
18974 case DFmode:
18975 case XFmode:
18976 case QImode:
18977 case HImode:
18978 case SImode:
18979 simple:
18980 tmp = ix86_expand_compare (code, op0, op1);
18981 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18982 gen_rtx_LABEL_REF (VOIDmode, label),
18983 pc_rtx);
18984 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18985 return;
18987 case DImode:
18988 if (TARGET_64BIT)
18989 goto simple;
18990 case TImode:
18991 /* Expand DImode branch into multiple compare+branch. */
18993 rtx lo[2], hi[2], label2;
18994 enum rtx_code code1, code2, code3;
18995 enum machine_mode submode;
18997 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18999 tmp = op0, op0 = op1, op1 = tmp;
19000 code = swap_condition (code);
19003 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19004 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19006 submode = mode == DImode ? SImode : DImode;
19008 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19009 avoid two branches. This costs one extra insn, so disable when
19010 optimizing for size. */
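/* For illustration, with this trick a 64-bit "a == b" on a 32-bit
   target becomes roughly

     t = (hi(a) ^ hi(b)) | (lo(a) ^ lo(b));
     if (t == 0) goto label;

   i.e. two xors, an or and a single branch instead of two
   compare-and-branch pairs.  */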
19012 if ((code == EQ || code == NE)
19013 && (!optimize_insn_for_size_p ()
19014 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19016 rtx xor0, xor1;
19018 xor1 = hi[0];
19019 if (hi[1] != const0_rtx)
19020 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19021 NULL_RTX, 0, OPTAB_WIDEN);
19023 xor0 = lo[0];
19024 if (lo[1] != const0_rtx)
19025 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19026 NULL_RTX, 0, OPTAB_WIDEN);
19028 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19029 NULL_RTX, 0, OPTAB_WIDEN);
19031 ix86_expand_branch (code, tmp, const0_rtx, label);
19032 return;
19035 /* Otherwise, if we are doing a less-than or greater-than-or-equal
19036 comparison, op1 is a constant and the low word is zero, then we can
19037 just examine the high word. Similarly for a low word of -1 and
19038 less-than-or-equal or greater-than. */
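/* For illustration, on a 32-bit target a comparison such as
   "a < 0x500000000LL" has lo(op1) == 0, so only "hi(a) < 0x5" needs
   to be tested; the low word cannot change the result.  */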
19040 if (CONST_INT_P (hi[1]))
19041 switch (code)
19043 case LT: case LTU: case GE: case GEU:
19044 if (lo[1] == const0_rtx)
19046 ix86_expand_branch (code, hi[0], hi[1], label);
19047 return;
19049 break;
19050 case LE: case LEU: case GT: case GTU:
19051 if (lo[1] == constm1_rtx)
19053 ix86_expand_branch (code, hi[0], hi[1], label);
19054 return;
19056 break;
19057 default:
19058 break;
19061 /* Otherwise, we need two or three jumps. */
19063 label2 = gen_label_rtx ();
19065 code1 = code;
19066 code2 = swap_condition (code);
19067 code3 = unsigned_condition (code);
19069 switch (code)
19071 case LT: case GT: case LTU: case GTU:
19072 break;
19074 case LE: code1 = LT; code2 = GT; break;
19075 case GE: code1 = GT; code2 = LT; break;
19076 case LEU: code1 = LTU; code2 = GTU; break;
19077 case GEU: code1 = GTU; code2 = LTU; break;
19079 case EQ: code1 = UNKNOWN; code2 = NE; break;
19080 case NE: code2 = UNKNOWN; break;
19082 default:
19083 gcc_unreachable ();
19087 * a < b =>
19088 * if (hi(a) < hi(b)) goto true;
19089 * if (hi(a) > hi(b)) goto false;
19090 * if (lo(a) < lo(b)) goto true;
19091 * false:
19094 if (code1 != UNKNOWN)
19095 ix86_expand_branch (code1, hi[0], hi[1], label);
19096 if (code2 != UNKNOWN)
19097 ix86_expand_branch (code2, hi[0], hi[1], label2);
19099 ix86_expand_branch (code3, lo[0], lo[1], label);
19101 if (code2 != UNKNOWN)
19102 emit_label (label2);
19103 return;
19106 default:
19107 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19108 goto simple;
19112 /* Split branch based on floating point condition. */
19113 void
19114 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19115 rtx target1, rtx target2, rtx tmp, rtx pushed)
19117 rtx condition;
19118 rtx i;
19120 if (target2 != pc_rtx)
19122 rtx tmp = target2;
19123 code = reverse_condition_maybe_unordered (code);
19124 target2 = target1;
19125 target1 = tmp;
19128 condition = ix86_expand_fp_compare (code, op1, op2,
19129 tmp);
19131 /* Remove pushed operand from stack. */
19132 if (pushed)
19133 ix86_free_from_memory (GET_MODE (pushed));
19135 i = emit_jump_insn (gen_rtx_SET
19136 (VOIDmode, pc_rtx,
19137 gen_rtx_IF_THEN_ELSE (VOIDmode,
19138 condition, target1, target2)));
19139 if (split_branch_probability >= 0)
19140 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19143 void
19144 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19146 rtx ret;
19148 gcc_assert (GET_MODE (dest) == QImode);
19150 ret = ix86_expand_compare (code, op0, op1);
19151 PUT_MODE (ret, QImode);
19152 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19155 /* Expand a comparison setting or clearing the carry flag. Return true
19156 when successful and set *POP to the resulting comparison. */
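/* For illustration, an unsigned "a < b" maps directly onto the carry
   flag: comparing a against b sets CF exactly when a < b, so an
   LTU/GEU result can be consumed by adc/sbb without first
   materializing a 0/1 value.  */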
19157 static bool
19158 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19160 enum machine_mode mode =
19161 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19163 /* Do not handle double-mode compares that go through the special path. */
19164 if (mode == (TARGET_64BIT ? TImode : DImode))
19165 return false;
19167 if (SCALAR_FLOAT_MODE_P (mode))
19169 rtx compare_op, compare_seq;
19171 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19173 /* Shortcut: the following common codes never translate
19174 into carry-flag compares. */
19175 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19176 || code == ORDERED || code == UNORDERED)
19177 return false;
19179 /* These comparisons require the zero flag; swap operands so they won't need it. */
19180 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19181 && !TARGET_IEEE_FP)
19183 rtx tmp = op0;
19184 op0 = op1;
19185 op1 = tmp;
19186 code = swap_condition (code);
19189 /* Try to expand the comparison and verify that we end up with
19190 carry flag based comparison. This fails to be true only when
19191 we decide to expand the comparison using arithmetic, which is not
19192 a common scenario. */
19193 start_sequence ();
19194 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19195 compare_seq = get_insns ();
19196 end_sequence ();
19198 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19199 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19200 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19201 else
19202 code = GET_CODE (compare_op);
19204 if (code != LTU && code != GEU)
19205 return false;
19207 emit_insn (compare_seq);
19208 *pop = compare_op;
19209 return true;
19212 if (!INTEGRAL_MODE_P (mode))
19213 return false;
19215 switch (code)
19217 case LTU:
19218 case GEU:
19219 break;
19221 /* Convert a==0 into (unsigned)a<1. */
19222 case EQ:
19223 case NE:
19224 if (op1 != const0_rtx)
19225 return false;
19226 op1 = const1_rtx;
19227 code = (code == EQ ? LTU : GEU);
19228 break;
19230 /* Convert a>b into b<a or a>=b-1. */
19231 case GTU:
19232 case LEU:
19233 if (CONST_INT_P (op1))
19235 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19236 /* Bail out on overflow. We could still swap the operands, but that
19237 would force loading the constant into a register. */
19238 if (op1 == const0_rtx
19239 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19240 return false;
19241 code = (code == GTU ? GEU : LTU);
19243 else
19245 rtx tmp = op1;
19246 op1 = op0;
19247 op0 = tmp;
19248 code = (code == GTU ? LTU : GEU);
19250 break;
19252 /* Convert a>=0 into (unsigned)a<0x80000000. */
19253 case LT:
19254 case GE:
19255 if (mode == DImode || op1 != const0_rtx)
19256 return false;
19257 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19258 code = (code == LT ? GEU : LTU);
19259 break;
19260 case LE:
19261 case GT:
19262 if (mode == DImode || op1 != constm1_rtx)
19263 return false;
19264 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19265 code = (code == LE ? GEU : LTU);
19266 break;
19268 default:
19269 return false;
19271 /* Swapping operands may cause a constant to appear as the first operand. */
19272 if (!nonimmediate_operand (op0, VOIDmode))
19274 if (!can_create_pseudo_p ())
19275 return false;
19276 op0 = force_reg (mode, op0);
19278 *pop = ix86_expand_compare (code, op0, op1);
19279 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19280 return true;
19283 bool
19284 ix86_expand_int_movcc (rtx operands[])
19286 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19287 rtx compare_seq, compare_op;
19288 enum machine_mode mode = GET_MODE (operands[0]);
19289 bool sign_bit_compare_p = false;
19290 rtx op0 = XEXP (operands[1], 0);
19291 rtx op1 = XEXP (operands[1], 1);
19293 if (GET_MODE (op0) == TImode
19294 || (GET_MODE (op0) == DImode
19295 && !TARGET_64BIT))
19296 return false;
19298 start_sequence ();
19299 compare_op = ix86_expand_compare (code, op0, op1);
19300 compare_seq = get_insns ();
19301 end_sequence ();
19303 compare_code = GET_CODE (compare_op);
19305 if ((op1 == const0_rtx && (code == GE || code == LT))
19306 || (op1 == constm1_rtx && (code == GT || code == LE)))
19307 sign_bit_compare_p = true;
19309 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19310 HImode insns, we'd be swallowed in word prefix ops. */
19312 if ((mode != HImode || TARGET_FAST_PREFIX)
19313 && (mode != (TARGET_64BIT ? TImode : DImode))
19314 && CONST_INT_P (operands[2])
19315 && CONST_INT_P (operands[3]))
19317 rtx out = operands[0];
19318 HOST_WIDE_INT ct = INTVAL (operands[2]);
19319 HOST_WIDE_INT cf = INTVAL (operands[3]);
19320 HOST_WIDE_INT diff;
19322 diff = ct - cf;
19323 /* Sign bit compares are better done using shifts than by using
19324 sbb. */
19325 if (sign_bit_compare_p
19326 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19328 /* Detect overlap between destination and compare sources. */
19329 rtx tmp = out;
19331 if (!sign_bit_compare_p)
19333 rtx flags;
19334 bool fpcmp = false;
19336 compare_code = GET_CODE (compare_op);
19338 flags = XEXP (compare_op, 0);
19340 if (GET_MODE (flags) == CCFPmode
19341 || GET_MODE (flags) == CCFPUmode)
19343 fpcmp = true;
19344 compare_code
19345 = ix86_fp_compare_code_to_integer (compare_code);
19348 /* To simplify the rest of the code, restrict to the GEU case. */
19349 if (compare_code == LTU)
19351 HOST_WIDE_INT tmp = ct;
19352 ct = cf;
19353 cf = tmp;
19354 compare_code = reverse_condition (compare_code);
19355 code = reverse_condition (code);
19357 else
19359 if (fpcmp)
19360 PUT_CODE (compare_op,
19361 reverse_condition_maybe_unordered
19362 (GET_CODE (compare_op)));
19363 else
19364 PUT_CODE (compare_op,
19365 reverse_condition (GET_CODE (compare_op)));
19367 diff = ct - cf;
19369 if (reg_overlap_mentioned_p (out, op0)
19370 || reg_overlap_mentioned_p (out, op1))
19371 tmp = gen_reg_rtx (mode);
19373 if (mode == DImode)
19374 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19375 else
19376 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19377 flags, compare_op));
19379 else
19381 if (code == GT || code == GE)
19382 code = reverse_condition (code);
19383 else
19385 HOST_WIDE_INT tmp = ct;
19386 ct = cf;
19387 cf = tmp;
19388 diff = ct - cf;
19390 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19393 if (diff == 1)
19396 * cmpl op0,op1
19397 * sbbl dest,dest
19398 * [addl dest, ct]
19400 * Size 5 - 8.
19402 if (ct)
19403 tmp = expand_simple_binop (mode, PLUS,
19404 tmp, GEN_INT (ct),
19405 copy_rtx (tmp), 1, OPTAB_DIRECT);
19407 else if (cf == -1)
19410 * cmpl op0,op1
19411 * sbbl dest,dest
19412 * orl $ct, dest
19414 * Size 8.
19416 tmp = expand_simple_binop (mode, IOR,
19417 tmp, GEN_INT (ct),
19418 copy_rtx (tmp), 1, OPTAB_DIRECT);
19420 else if (diff == -1 && ct)
19423 * cmpl op0,op1
19424 * sbbl dest,dest
19425 * notl dest
19426 * [addl dest, cf]
19428 * Size 8 - 11.
19430 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19431 if (cf)
19432 tmp = expand_simple_binop (mode, PLUS,
19433 copy_rtx (tmp), GEN_INT (cf),
19434 copy_rtx (tmp), 1, OPTAB_DIRECT);
19436 else
19439 * cmpl op0,op1
19440 * sbbl dest,dest
19441 * [notl dest]
19442 * andl cf - ct, dest
19443 * [addl dest, ct]
19445 * Size 8 - 11.
19448 if (cf == 0)
19450 cf = ct;
19451 ct = 0;
19452 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19455 tmp = expand_simple_binop (mode, AND,
19456 copy_rtx (tmp),
19457 gen_int_mode (cf - ct, mode),
19458 copy_rtx (tmp), 1, OPTAB_DIRECT);
19459 if (ct)
19460 tmp = expand_simple_binop (mode, PLUS,
19461 copy_rtx (tmp), GEN_INT (ct),
19462 copy_rtx (tmp), 1, OPTAB_DIRECT);
19465 if (!rtx_equal_p (tmp, out))
19466 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19468 return true;
19471 if (diff < 0)
19473 enum machine_mode cmp_mode = GET_MODE (op0);
19475 HOST_WIDE_INT tmp;
19476 tmp = ct, ct = cf, cf = tmp;
19477 diff = -diff;
19479 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19481 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19483 /* We may be reversing an unordered compare to a normal compare, which
19484 is not valid in general (we may convert a non-trapping condition
19485 to a trapping one); however, on i386 we currently emit all
19486 comparisons unordered. */
19487 compare_code = reverse_condition_maybe_unordered (compare_code);
19488 code = reverse_condition_maybe_unordered (code);
19490 else
19492 compare_code = reverse_condition (compare_code);
19493 code = reverse_condition (code);
19497 compare_code = UNKNOWN;
19498 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19499 && CONST_INT_P (op1))
19501 if (op1 == const0_rtx
19502 && (code == LT || code == GE))
19503 compare_code = code;
19504 else if (op1 == constm1_rtx)
19506 if (code == LE)
19507 compare_code = LT;
19508 else if (code == GT)
19509 compare_code = GE;
19513 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19514 if (compare_code != UNKNOWN
19515 && GET_MODE (op0) == GET_MODE (out)
19516 && (cf == -1 || ct == -1))
19518 /* If lea code below could be used, only optimize
19519 if it results in a 2 insn sequence. */
19521 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19522 || diff == 3 || diff == 5 || diff == 9)
19523 || (compare_code == LT && ct == -1)
19524 || (compare_code == GE && cf == -1))
19527 * notl op1 (if necessary)
19528 * sarl $31, op1
19529 * orl cf, op1
19531 if (ct != -1)
19533 cf = ct;
19534 ct = -1;
19535 code = reverse_condition (code);
19538 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19540 out = expand_simple_binop (mode, IOR,
19541 out, GEN_INT (cf),
19542 out, 1, OPTAB_DIRECT);
19543 if (out != operands[0])
19544 emit_move_insn (operands[0], out);
19546 return true;
19551 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19552 || diff == 3 || diff == 5 || diff == 9)
19553 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19554 && (mode != DImode
19555 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19558 * xorl dest,dest
19559 * cmpl op1,op2
19560 * setcc dest
19561 * lea cf(dest*(ct-cf)),dest
19563 * Size 14.
19565 * This also catches the degenerate setcc-only case.
19568 rtx tmp;
19569 int nops;
19571 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19573 nops = 0;
19574 /* On x86_64 the lea instruction operates on Pmode, so we need
19575 to do the arithmetic in the proper mode to match. */
19576 if (diff == 1)
19577 tmp = copy_rtx (out);
19578 else
19580 rtx out1;
19581 out1 = copy_rtx (out);
19582 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19583 nops++;
19584 if (diff & 1)
19586 tmp = gen_rtx_PLUS (mode, tmp, out1);
19587 nops++;
19590 if (cf != 0)
19592 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19593 nops++;
19595 if (!rtx_equal_p (tmp, out))
19597 if (nops == 1)
19598 out = force_operand (tmp, copy_rtx (out));
19599 else
19600 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19602 if (!rtx_equal_p (out, operands[0]))
19603 emit_move_insn (operands[0], copy_rtx (out));
19605 return true;
19609 * General case: Jumpful:
19610 * xorl dest,dest cmpl op1, op2
19611 * cmpl op1, op2 movl ct, dest
19612 * setcc dest jcc 1f
19613 * decl dest movl cf, dest
19614 * andl (cf-ct),dest 1:
19615 * addl ct,dest
19617 * Size 20. Size 14.
19619 * This is reasonably steep, but branch mispredict costs are
19620 * high on modern cpus, so consider failing only if optimizing
19621 * for space.
19624 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19625 && BRANCH_COST (optimize_insn_for_speed_p (),
19626 false) >= 2)
19628 if (cf == 0)
19630 enum machine_mode cmp_mode = GET_MODE (op0);
19632 cf = ct;
19633 ct = 0;
19635 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19637 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19639 /* We may be reversing an unordered compare to a normal compare,
19640 which is not valid in general (we may convert a non-trapping
19641 condition to a trapping one); however, on i386 we currently
19642 emit all comparisons unordered. */
19643 code = reverse_condition_maybe_unordered (code);
19645 else
19647 code = reverse_condition (code);
19648 if (compare_code != UNKNOWN)
19649 compare_code = reverse_condition (compare_code);
19653 if (compare_code != UNKNOWN)
19655 /* notl op1 (if needed)
19656 sarl $31, op1
19657 andl (cf-ct), op1
19658 addl ct, op1
19660 For x < 0 (resp. x <= -1) there will be no notl,
19661 so if possible swap the constants to get rid of the
19662 complement.
19663 True/false will be -1/0 while code below (store flag
19664 followed by decrement) is 0/-1, so the constants need
19665 to be exchanged once more. */
19667 if (compare_code == GE || !cf)
19669 code = reverse_condition (code);
19670 compare_code = LT;
19672 else
19674 HOST_WIDE_INT tmp = cf;
19675 cf = ct;
19676 ct = tmp;
19679 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19681 else
19683 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19685 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19686 constm1_rtx,
19687 copy_rtx (out), 1, OPTAB_DIRECT);
19690 out = expand_simple_binop (mode, AND, copy_rtx (out),
19691 gen_int_mode (cf - ct, mode),
19692 copy_rtx (out), 1, OPTAB_DIRECT);
19693 if (ct)
19694 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19695 copy_rtx (out), 1, OPTAB_DIRECT);
19696 if (!rtx_equal_p (out, operands[0]))
19697 emit_move_insn (operands[0], copy_rtx (out));
19699 return true;
19703 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19705 /* Try a few things more with specific constants and a variable. */
19707 optab op;
19708 rtx var, orig_out, out, tmp;
19710 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19711 return false;
19713 /* If one of the two operands is an interesting constant, load a
19714 constant with the above and mask it in with a logical operation. */
19716 if (CONST_INT_P (operands[2]))
19718 var = operands[3];
19719 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19720 operands[3] = constm1_rtx, op = and_optab;
19721 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19722 operands[3] = const0_rtx, op = ior_optab;
19723 else
19724 return false;
19726 else if (CONST_INT_P (operands[3]))
19728 var = operands[2];
19729 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19730 operands[2] = constm1_rtx, op = and_optab;
19731 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19732 operands[2] = const0_rtx, op = ior_optab;
19733 else
19734 return false;
19736 else
19737 return false;
19739 orig_out = operands[0];
19740 tmp = gen_reg_rtx (mode);
19741 operands[0] = tmp;
19743 /* Recurse to get the constant loaded. */
19744 if (ix86_expand_int_movcc (operands) == 0)
19745 return false;
19747 /* Mask in the interesting variable. */
19748 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19749 OPTAB_WIDEN);
19750 if (!rtx_equal_p (out, orig_out))
19751 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19753 return true;
19757 * For comparison with above,
19759 * movl cf,dest
19760 * movl ct,tmp
19761 * cmpl op1,op2
19762 * cmovcc tmp,dest
19764 * Size 15.
19767 if (! nonimmediate_operand (operands[2], mode))
19768 operands[2] = force_reg (mode, operands[2]);
19769 if (! nonimmediate_operand (operands[3], mode))
19770 operands[3] = force_reg (mode, operands[3]);
19772 if (! register_operand (operands[2], VOIDmode)
19773 && (mode == QImode
19774 || ! register_operand (operands[3], VOIDmode)))
19775 operands[2] = force_reg (mode, operands[2]);
19777 if (mode == QImode
19778 && ! register_operand (operands[3], VOIDmode))
19779 operands[3] = force_reg (mode, operands[3]);
19781 emit_insn (compare_seq);
19782 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19783 gen_rtx_IF_THEN_ELSE (mode,
19784 compare_op, operands[2],
19785 operands[3])));
19786 return true;
19789 /* Swap, force into registers, or otherwise massage the two operands
19790 to an sse comparison with a mask result. Thus we differ a bit from
19791 ix86_prepare_fp_compare_args which expects to produce a flags result.
19793 The DEST operand exists to help determine whether to commute commutative
19794 operators. The POP0/POP1 operands are updated in place. The new
19795 comparison code is returned, or UNKNOWN if not implementable. */
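/* For illustration, before AVX there is no direct SSE encoding for
   "a >= b", so the operands are swapped and the code becomes
   "b <= a"; GT, UNLE and UNLT are likewise replaced by their swapped
   forms below.  */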
19797 static enum rtx_code
19798 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19799 rtx *pop0, rtx *pop1)
19801 rtx tmp;
19803 switch (code)
19805 case LTGT:
19806 case UNEQ:
19807 /* AVX supports all the needed comparisons. */
19808 if (TARGET_AVX)
19809 break;
19810 /* We have no LTGT as an operator. We could implement it with
19811 NE & ORDERED, but this requires an extra temporary. It's
19812 not clear that it's worth it. */
19813 return UNKNOWN;
19815 case LT:
19816 case LE:
19817 case UNGT:
19818 case UNGE:
19819 /* These are supported directly. */
19820 break;
19822 case EQ:
19823 case NE:
19824 case UNORDERED:
19825 case ORDERED:
19826 /* AVX has 3 operand comparisons, no need to swap anything. */
19827 if (TARGET_AVX)
19828 break;
19829 /* For commutative operators, try to canonicalize the destination
19830 operand to be first in the comparison - this helps reload to
19831 avoid extra moves. */
19832 if (!dest || !rtx_equal_p (dest, *pop1))
19833 break;
19834 /* FALLTHRU */
19836 case GE:
19837 case GT:
19838 case UNLE:
19839 case UNLT:
19840 /* These are not supported directly before AVX, and furthermore
19841 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19842 comparison operands to transform into something that is
19843 supported. */
19844 tmp = *pop0;
19845 *pop0 = *pop1;
19846 *pop1 = tmp;
19847 code = swap_condition (code);
19848 break;
19850 default:
19851 gcc_unreachable ();
19854 return code;
19857 /* Detect conditional moves that exactly match min/max operational
19858 semantics. Note that this is IEEE safe, as long as we don't
19859 interchange the operands.
19861 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19862 and TRUE if the operation is successful and instructions are emitted. */
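/* For illustration, the shapes recognized here are

     dest = (a < b) ? a : b    -->  min (a, b)
     dest = (a < b) ? b : a    -->  max (a, b)

   with UNGE handled by first swapping IF_TRUE and IF_FALSE.  Keeping
   the original operand order is what preserves the IEEE semantics for
   NaNs and signed zeros.  */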
19864 static bool
19865 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19866 rtx cmp_op1, rtx if_true, rtx if_false)
19868 enum machine_mode mode;
19869 bool is_min;
19870 rtx tmp;
19872 if (code == LT)
19874 else if (code == UNGE)
19876 tmp = if_true;
19877 if_true = if_false;
19878 if_false = tmp;
19880 else
19881 return false;
19883 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19884 is_min = true;
19885 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19886 is_min = false;
19887 else
19888 return false;
19890 mode = GET_MODE (dest);
19892 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19893 but MODE may be a vector mode and thus not appropriate. */
19894 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19896 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19897 rtvec v;
19899 if_true = force_reg (mode, if_true);
19900 v = gen_rtvec (2, if_true, if_false);
19901 tmp = gen_rtx_UNSPEC (mode, v, u);
19903 else
19905 code = is_min ? SMIN : SMAX;
19906 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19909 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19910 return true;
19913 /* Expand an sse vector comparison. Return the register with the result. */
19915 static rtx
19916 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19917 rtx op_true, rtx op_false)
19919 enum machine_mode mode = GET_MODE (dest);
19920 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19921 rtx x;
19923 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19924 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19925 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19927 if (optimize
19928 || reg_overlap_mentioned_p (dest, op_true)
19929 || reg_overlap_mentioned_p (dest, op_false))
19930 dest = gen_reg_rtx (mode);
19932 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19933 if (cmp_mode != mode)
19935 x = force_reg (cmp_mode, x);
19936 convert_move (dest, x, false);
19938 else
19939 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19941 return dest;
19944 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19945 operations. This is used for both scalar and vector conditional moves. */
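/* For illustration, when no blend instruction applies the fallback at
   the end of this function uses the classic mask trick, roughly

     dest = (cmp & op_true) | (~cmp & op_false);

   where CMP holds all-ones or all-zeros per element.  */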
19947 static void
19948 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19950 enum machine_mode mode = GET_MODE (dest);
19951 rtx t2, t3, x;
19953 if (vector_all_ones_operand (op_true, mode)
19954 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19956 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19958 else if (op_false == CONST0_RTX (mode))
19960 op_true = force_reg (mode, op_true);
19961 x = gen_rtx_AND (mode, cmp, op_true);
19962 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19964 else if (op_true == CONST0_RTX (mode))
19966 op_false = force_reg (mode, op_false);
19967 x = gen_rtx_NOT (mode, cmp);
19968 x = gen_rtx_AND (mode, x, op_false);
19969 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19971 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19973 op_false = force_reg (mode, op_false);
19974 x = gen_rtx_IOR (mode, cmp, op_false);
19975 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19977 else if (TARGET_XOP)
19979 op_true = force_reg (mode, op_true);
19981 if (!nonimmediate_operand (op_false, mode))
19982 op_false = force_reg (mode, op_false);
19984 emit_insn (gen_rtx_SET (mode, dest,
19985 gen_rtx_IF_THEN_ELSE (mode, cmp,
19986 op_true,
19987 op_false)));
19989 else
19991 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19993 if (!nonimmediate_operand (op_true, mode))
19994 op_true = force_reg (mode, op_true);
19996 op_false = force_reg (mode, op_false);
19998 switch (mode)
20000 case V4SFmode:
20001 if (TARGET_SSE4_1)
20002 gen = gen_sse4_1_blendvps;
20003 break;
20004 case V2DFmode:
20005 if (TARGET_SSE4_1)
20006 gen = gen_sse4_1_blendvpd;
20007 break;
20008 case V16QImode:
20009 case V8HImode:
20010 case V4SImode:
20011 case V2DImode:
20012 if (TARGET_SSE4_1)
20014 gen = gen_sse4_1_pblendvb;
20015 dest = gen_lowpart (V16QImode, dest);
20016 op_false = gen_lowpart (V16QImode, op_false);
20017 op_true = gen_lowpart (V16QImode, op_true);
20018 cmp = gen_lowpart (V16QImode, cmp);
20020 break;
20021 case V8SFmode:
20022 if (TARGET_AVX)
20023 gen = gen_avx_blendvps256;
20024 break;
20025 case V4DFmode:
20026 if (TARGET_AVX)
20027 gen = gen_avx_blendvpd256;
20028 break;
20029 case V32QImode:
20030 case V16HImode:
20031 case V8SImode:
20032 case V4DImode:
20033 if (TARGET_AVX2)
20035 gen = gen_avx2_pblendvb;
20036 dest = gen_lowpart (V32QImode, dest);
20037 op_false = gen_lowpart (V32QImode, op_false);
20038 op_true = gen_lowpart (V32QImode, op_true);
20039 cmp = gen_lowpart (V32QImode, cmp);
20041 break;
20042 default:
20043 break;
20046 if (gen != NULL)
20047 emit_insn (gen (dest, op_false, op_true, cmp));
20048 else
20050 op_true = force_reg (mode, op_true);
20052 t2 = gen_reg_rtx (mode);
20053 if (optimize)
20054 t3 = gen_reg_rtx (mode);
20055 else
20056 t3 = dest;
20058 x = gen_rtx_AND (mode, op_true, cmp);
20059 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20061 x = gen_rtx_NOT (mode, cmp);
20062 x = gen_rtx_AND (mode, x, op_false);
20063 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20065 x = gen_rtx_IOR (mode, t3, t2);
20066 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20071 /* Expand a floating-point conditional move. Return true if successful. */
20073 bool
20074 ix86_expand_fp_movcc (rtx operands[])
20076 enum machine_mode mode = GET_MODE (operands[0]);
20077 enum rtx_code code = GET_CODE (operands[1]);
20078 rtx tmp, compare_op;
20079 rtx op0 = XEXP (operands[1], 0);
20080 rtx op1 = XEXP (operands[1], 1);
20082 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20084 enum machine_mode cmode;
20086 /* Since we have no cmove for SSE registers, don't force bad register
20087 allocation just to gain access to it. Deny movcc when the
20088 comparison mode doesn't match the move mode. */
20089 cmode = GET_MODE (op0);
20090 if (cmode == VOIDmode)
20091 cmode = GET_MODE (op1);
20092 if (cmode != mode)
20093 return false;
20095 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20096 if (code == UNKNOWN)
20097 return false;
20099 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20100 operands[2], operands[3]))
20101 return true;
20103 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20104 operands[2], operands[3]);
20105 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20106 return true;
20109 if (GET_MODE (op0) == TImode
20110 || (GET_MODE (op0) == DImode
20111 && !TARGET_64BIT))
20112 return false;
20114 /* The floating point conditional move instructions don't directly
20115 support conditions resulting from a signed integer comparison. */
20117 compare_op = ix86_expand_compare (code, op0, op1);
20118 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20120 tmp = gen_reg_rtx (QImode);
20121 ix86_expand_setcc (tmp, code, op0, op1);
20123 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20126 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20127 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20128 operands[2], operands[3])));
20130 return true;
20133 /* Expand a floating-point vector conditional move; a vcond operation
20134 rather than a movcc operation. */
20136 bool
20137 ix86_expand_fp_vcond (rtx operands[])
20139 enum rtx_code code = GET_CODE (operands[3]);
20140 rtx cmp;
20142 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20143 &operands[4], &operands[5]);
20144 if (code == UNKNOWN)
20146 rtx temp;
20147 switch (GET_CODE (operands[3]))
20149 case LTGT:
20150 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20151 operands[5], operands[0], operands[0]);
20152 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20153 operands[5], operands[1], operands[2]);
20154 code = AND;
20155 break;
20156 case UNEQ:
20157 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20158 operands[5], operands[0], operands[0]);
20159 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20160 operands[5], operands[1], operands[2]);
20161 code = IOR;
20162 break;
20163 default:
20164 gcc_unreachable ();
20166 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20167 OPTAB_DIRECT);
20168 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20169 return true;
20172 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20173 operands[5], operands[1], operands[2]))
20174 return true;
20176 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20177 operands[1], operands[2]);
20178 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20179 return true;
20182 /* Expand a signed/unsigned integral vector conditional move. */
20184 bool
20185 ix86_expand_int_vcond (rtx operands[])
20187 enum machine_mode data_mode = GET_MODE (operands[0]);
20188 enum machine_mode mode = GET_MODE (operands[4]);
20189 enum rtx_code code = GET_CODE (operands[3]);
20190 bool negate = false;
20191 rtx x, cop0, cop1;
20193 cop0 = operands[4];
20194 cop1 = operands[5];
20196 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20197 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20198 if ((code == LT || code == GE)
20199 && data_mode == mode
20200 && cop1 == CONST0_RTX (mode)
20201 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20202 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20203 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20204 && (GET_MODE_SIZE (data_mode) == 16
20205 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20207 rtx negop = operands[2 - (code == LT)];
20208 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20209 if (negop == CONST1_RTX (data_mode))
20211 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20212 operands[0], 1, OPTAB_DIRECT);
20213 if (res != operands[0])
20214 emit_move_insn (operands[0], res);
20215 return true;
20217 else if (GET_MODE_INNER (data_mode) != DImode
20218 && vector_all_ones_operand (negop, data_mode))
20220 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20221 operands[0], 0, OPTAB_DIRECT);
20222 if (res != operands[0])
20223 emit_move_insn (operands[0], res);
20224 return true;
20228 if (!nonimmediate_operand (cop1, mode))
20229 cop1 = force_reg (mode, cop1);
20230 if (!general_operand (operands[1], data_mode))
20231 operands[1] = force_reg (data_mode, operands[1]);
20232 if (!general_operand (operands[2], data_mode))
20233 operands[2] = force_reg (data_mode, operands[2]);
20235 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20236 if (TARGET_XOP
20237 && (mode == V16QImode || mode == V8HImode
20238 || mode == V4SImode || mode == V2DImode))
20240 else
20242 /* Canonicalize the comparison to EQ, GT, GTU. */
20243 switch (code)
20245 case EQ:
20246 case GT:
20247 case GTU:
20248 break;
20250 case NE:
20251 case LE:
20252 case LEU:
20253 code = reverse_condition (code);
20254 negate = true;
20255 break;
20257 case GE:
20258 case GEU:
20259 code = reverse_condition (code);
20260 negate = true;
20261 /* FALLTHRU */
20263 case LT:
20264 case LTU:
20265 code = swap_condition (code);
20266 x = cop0, cop0 = cop1, cop1 = x;
20267 break;
20269 default:
20270 gcc_unreachable ();
20273 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20274 if (mode == V2DImode)
20276 switch (code)
20278 case EQ:
20279 /* SSE4.1 supports EQ. */
20280 if (!TARGET_SSE4_1)
20281 return false;
20282 break;
20284 case GT:
20285 case GTU:
20286 /* SSE4.2 supports GT/GTU. */
20287 if (!TARGET_SSE4_2)
20288 return false;
20289 break;
20291 default:
20292 gcc_unreachable ();
20296 /* Unsigned parallel compare is not supported by the hardware.
20297 Play some tricks to turn this into a signed comparison
20298 against 0. */
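/* For illustration, for SImode/DImode elements an unsigned "a > b" is
   rewritten below by biasing both sides with the per-element sign-bit
   mask (0x80000000 for SImode):

     (a - 0x80000000) >signed (b - 0x80000000)

   so the signed compare instruction gives the unsigned answer.  For
   QImode/HImode elements an unsigned saturating subtraction is used
   instead: a > b exactly when (a -us b) != 0.  */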
20299 if (code == GTU)
20301 cop0 = force_reg (mode, cop0);
20303 switch (mode)
20305 case V8SImode:
20306 case V4DImode:
20307 case V4SImode:
20308 case V2DImode:
20310 rtx t1, t2, mask;
20311 rtx (*gen_sub3) (rtx, rtx, rtx);
20313 switch (mode)
20315 case V8SImode: gen_sub3 = gen_subv8si3; break;
20316 case V4DImode: gen_sub3 = gen_subv4di3; break;
20317 case V4SImode: gen_sub3 = gen_subv4si3; break;
20318 case V2DImode: gen_sub3 = gen_subv2di3; break;
20319 default:
20320 gcc_unreachable ();
20322 /* Subtract (-(INT MAX) - 1) from both operands to make
20323 them signed. */
20324 mask = ix86_build_signbit_mask (mode, true, false);
20325 t1 = gen_reg_rtx (mode);
20326 emit_insn (gen_sub3 (t1, cop0, mask));
20328 t2 = gen_reg_rtx (mode);
20329 emit_insn (gen_sub3 (t2, cop1, mask));
20331 cop0 = t1;
20332 cop1 = t2;
20333 code = GT;
20335 break;
20337 case V32QImode:
20338 case V16HImode:
20339 case V16QImode:
20340 case V8HImode:
20341 /* Perform a parallel unsigned saturating subtraction. */
20342 x = gen_reg_rtx (mode);
20343 emit_insn (gen_rtx_SET (VOIDmode, x,
20344 gen_rtx_US_MINUS (mode, cop0, cop1)));
20346 cop0 = x;
20347 cop1 = CONST0_RTX (mode);
20348 code = EQ;
20349 negate = !negate;
20350 break;
20352 default:
20353 gcc_unreachable ();
20358 /* Allow the comparison to be done in one mode, but the movcc to
20359 happen in another mode. */
20360 if (data_mode == mode)
20362 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20363 operands[1+negate], operands[2-negate]);
20365 else
20367 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20368 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20369 code, cop0, cop1,
20370 operands[1+negate], operands[2-negate]);
20371 x = gen_lowpart (data_mode, x);
20374 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20375 operands[2-negate]);
20376 return true;
20379 /* Expand a variable vector permutation. */
20381 void
20382 ix86_expand_vec_perm (rtx operands[])
20384 rtx target = operands[0];
20385 rtx op0 = operands[1];
20386 rtx op1 = operands[2];
20387 rtx mask = operands[3];
20388 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20389 enum machine_mode mode = GET_MODE (op0);
20390 enum machine_mode maskmode = GET_MODE (mask);
20391 int w, e, i;
20392 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20394 /* Number of elements in the vector. */
20395 w = GET_MODE_NUNITS (mode);
20396 e = GET_MODE_UNIT_SIZE (mode);
20397 gcc_assert (w <= 32);
20399 if (TARGET_AVX2)
20401 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20403 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20404 a constant shuffle operand. With a tiny bit of effort we can
20405 use VPERMD instead. A re-interpretation stall for V4DFmode is
20406 unfortunate but there's no avoiding it.
20407 Similarly for V16HImode we don't have instructions for variable
20408 shuffling, while for V32QImode we can use vpshufb; vpshufb; vpermq;
20409 vpor after preparing suitable masks. */
20411 if (mode == V16HImode)
20413 maskmode = mode = V32QImode;
20414 w = 32;
20415 e = 1;
20417 else
20419 maskmode = mode = V8SImode;
20420 w = 8;
20421 e = 4;
20423 t1 = gen_reg_rtx (maskmode);
20425 /* Replicate the low bits of the V4DImode mask into V8SImode:
20426 mask = { A B C D }
20427 t1 = { A A B B C C D D }. */
20428 for (i = 0; i < w / 2; ++i)
20429 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20430 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20431 vt = force_reg (maskmode, vt);
20432 mask = gen_lowpart (maskmode, mask);
20433 if (maskmode == V8SImode)
20434 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20435 else
20436 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20438 /* Multiply the shuffle indices by two. */
20439 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20440 OPTAB_DIRECT);
20442 /* Add one to the odd shuffle indices:
20443 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20444 for (i = 0; i < w / 2; ++i)
20446 vec[i * 2] = const0_rtx;
20447 vec[i * 2 + 1] = const1_rtx;
20449 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20450 vt = force_const_mem (maskmode, vt);
20451 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20452 OPTAB_DIRECT);
20454 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20455 operands[3] = mask = t1;
20456 target = gen_lowpart (mode, target);
20457 op0 = gen_lowpart (mode, op0);
20458 op1 = gen_lowpart (mode, op1);
20461 switch (mode)
20463 case V8SImode:
20464 /* The VPERMD and VPERMPS instructions already properly ignore
20465 the high bits of the shuffle elements. No need for us to
20466 perform an AND ourselves. */
20467 if (one_operand_shuffle)
20468 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20469 else
20471 t1 = gen_reg_rtx (V8SImode);
20472 t2 = gen_reg_rtx (V8SImode);
20473 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20474 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20475 goto merge_two;
20477 return;
20479 case V8SFmode:
20480 mask = gen_lowpart (V8SFmode, mask);
20481 if (one_operand_shuffle)
20482 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20483 else
20485 t1 = gen_reg_rtx (V8SFmode);
20486 t2 = gen_reg_rtx (V8SFmode);
20487 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20488 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20489 goto merge_two;
20491 return;
20493 case V4SImode:
20494 /* By combining the two 128-bit input vectors into one 256-bit
20495 input vector, we can use VPERMD and VPERMPS for the full
20496 two-operand shuffle. */
20497 t1 = gen_reg_rtx (V8SImode);
20498 t2 = gen_reg_rtx (V8SImode);
20499 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20500 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20501 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20502 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20503 return;
20505 case V4SFmode:
20506 t1 = gen_reg_rtx (V8SFmode);
20507 t2 = gen_reg_rtx (V8SImode);
20508 mask = gen_lowpart (V4SImode, mask);
20509 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20510 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20511 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20512 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20513 return;
20515 case V32QImode:
20516 t1 = gen_reg_rtx (V32QImode);
20517 t2 = gen_reg_rtx (V32QImode);
20518 t3 = gen_reg_rtx (V32QImode);
20519 vt2 = GEN_INT (128);
20520 for (i = 0; i < 32; i++)
20521 vec[i] = vt2;
20522 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20523 vt = force_reg (V32QImode, vt);
20524 for (i = 0; i < 32; i++)
20525 vec[i] = i < 16 ? vt2 : const0_rtx;
20526 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20527 vt2 = force_reg (V32QImode, vt2);
20528 /* From mask create two adjusted masks, which contain the same
20529 bits as mask in the low 7 bits of each vector element.
20530 The first mask will have the most significant bit clear
20531 if it requests an element from the same 128-bit lane
20532 and the MSB set if it requests an element from the other 128-bit lane.
20533 The second mask will have the opposite values of the MSB,
20534 and additionally will have its 128-bit lanes swapped.
20535 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20536 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20537 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20538 stands for the other 12 bytes. */
20539 /* The bit that says whether an element is from the same lane or the
20540 other lane is bit 4, so shift it up by 3 to the MSB position. */
20541 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20542 gen_lowpart (V4DImode, mask),
20543 GEN_INT (3)));
20544 /* Clear MSB bits from the mask just in case it had them set. */
20545 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20546 /* After this t1 will have the MSB set for elements from the other lane. */
20547 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20548 /* Clear bits other than MSB. */
20549 emit_insn (gen_andv32qi3 (t1, t1, vt));
20550 /* Or in the lower bits from mask into t3. */
20551 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20552 /* And invert MSB bits in t1, so MSB is set for elements from the same
20553 lane. */
20554 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20555 /* Swap 128-bit lanes in t3. */
20556 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20557 gen_lowpart (V4DImode, t3),
20558 const2_rtx, GEN_INT (3),
20559 const0_rtx, const1_rtx));
20560 /* And or in the lower bits from mask into t1. */
20561 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20562 if (one_operand_shuffle)
20564 /* Each of these shuffles will put 0s in places where an
20565 element from the other 128-bit lane is needed; otherwise it
20566 will shuffle in the requested value. */
20567 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20568 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20569 /* For t3 the 128-bit lanes are swapped again. */
20570 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20571 gen_lowpart (V4DImode, t3),
20572 const2_rtx, GEN_INT (3),
20573 const0_rtx, const1_rtx));
20574 /* ORing both together then gives the result. */
20575 emit_insn (gen_iorv32qi3 (target, t1, t3));
20576 return;
20579 t4 = gen_reg_rtx (V32QImode);
20580 /* Similar to the one_operand_shuffle code above,
20581 just repeated twice, once for each operand. The merge_two:
20582 code will merge the two results together. */
20583 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20584 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20585 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20586 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20587 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20588 gen_lowpart (V4DImode, t4),
20589 const2_rtx, GEN_INT (3),
20590 const0_rtx, const1_rtx));
20591 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20592 gen_lowpart (V4DImode, t3),
20593 const2_rtx, GEN_INT (3),
20594 const0_rtx, const1_rtx));
20595 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20596 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20597 t1 = t4;
20598 t2 = t3;
20599 goto merge_two;
20601 default:
20602 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20603 break;
20607 if (TARGET_XOP)
20609 /* The XOP VPPERM insn supports three inputs. By ignoring the
20610 one_operand_shuffle special case, we avoid creating another
20611 set of constant vectors in memory. */
20612 one_operand_shuffle = false;
20614 /* mask = mask & {2*w-1, ...} */
20615 vt = GEN_INT (2*w - 1);
20617 else
20619 /* mask = mask & {w-1, ...} */
20620 vt = GEN_INT (w - 1);
20623 for (i = 0; i < w; i++)
20624 vec[i] = vt;
20625 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20626 mask = expand_simple_binop (maskmode, AND, mask, vt,
20627 NULL_RTX, 0, OPTAB_DIRECT);
20629 /* For non-QImode operations, convert the word permutation control
20630 into a byte permutation control. */
20631 if (mode != V16QImode)
20633 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20634 GEN_INT (exact_log2 (e)),
20635 NULL_RTX, 0, OPTAB_DIRECT);
20637 /* Convert mask to vector of chars. */
20638 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20640 /* Replicate each of the input bytes into byte positions:
20641 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20642 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20643 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20644 for (i = 0; i < 16; ++i)
20645 vec[i] = GEN_INT (i/e * e);
20646 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20647 vt = force_const_mem (V16QImode, vt);
20648 if (TARGET_XOP)
20649 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20650 else
20651 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20653 /* Convert it into the byte positions by doing
20654 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20655 for (i = 0; i < 16; ++i)
20656 vec[i] = GEN_INT (i % e);
20657 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20658 vt = force_const_mem (V16QImode, vt);
20659 emit_insn (gen_addv16qi3 (mask, mask, vt));
20662 /* The actual shuffle operations all operate on V16QImode. */
20663 op0 = gen_lowpart (V16QImode, op0);
20664 op1 = gen_lowpart (V16QImode, op1);
20665 target = gen_lowpart (V16QImode, target);
20667 if (TARGET_XOP)
20669 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20671 else if (one_operand_shuffle)
20673 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20675 else
20677 rtx xops[6];
20678 bool ok;
20680 /* Shuffle the two input vectors independently. */
20681 t1 = gen_reg_rtx (V16QImode);
20682 t2 = gen_reg_rtx (V16QImode);
20683 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20684 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20686 merge_two:
20687 /* Then merge them together. The key is whether any given control
20688 element contained a bit set that indicates the second word. */
20689 mask = operands[3];
20690 vt = GEN_INT (w);
20691 if (maskmode == V2DImode && !TARGET_SSE4_1)
20693 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20694 more shuffle to convert the V2DI input mask into a V4SI
20695 input mask, at which point the masking done by ix86_expand_int_vcond
20696 will work as desired. */
20697 rtx t3 = gen_reg_rtx (V4SImode);
20698 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20699 const0_rtx, const0_rtx,
20700 const2_rtx, const2_rtx));
20701 mask = t3;
20702 maskmode = V4SImode;
20703 e = w = 4;
20706 for (i = 0; i < w; i++)
20707 vec[i] = vt;
20708 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20709 vt = force_reg (maskmode, vt);
20710 mask = expand_simple_binop (maskmode, AND, mask, vt,
20711 NULL_RTX, 0, OPTAB_DIRECT);
20713 xops[0] = gen_lowpart (mode, operands[0]);
20714 xops[1] = gen_lowpart (mode, t2);
20715 xops[2] = gen_lowpart (mode, t1);
20716 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20717 xops[4] = mask;
20718 xops[5] = vt;
20719 ok = ix86_expand_int_vcond (xops);
20720 gcc_assert (ok);
20724 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
20725 true if we should do zero extension, else sign extension. HIGH_P is
20726 true if we want the N/2 high elements, else the low elements. */
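/* For illustration, with SRC in V4SImode and HIGH_P false the SSE4.1
   path below widens the two low SImode elements into a V2DImode DEST
   (roughly pmovzxdq or pmovsxdq); with HIGH_P true the high half is
   first shifted down (or extracted for the 256-bit modes) and then
   widened the same way.  */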
20728 void
20729 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20731 enum machine_mode imode = GET_MODE (src);
20732 rtx tmp;
20734 if (TARGET_SSE4_1)
20736 rtx (*unpack)(rtx, rtx);
20737 rtx (*extract)(rtx, rtx) = NULL;
20738 enum machine_mode halfmode = BLKmode;
20740 switch (imode)
20742 case V32QImode:
20743 if (unsigned_p)
20744 unpack = gen_avx2_zero_extendv16qiv16hi2;
20745 else
20746 unpack = gen_avx2_sign_extendv16qiv16hi2;
20747 halfmode = V16QImode;
20748 extract
20749 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20750 break;
20751 case V16HImode:
20752 if (unsigned_p)
20753 unpack = gen_avx2_zero_extendv8hiv8si2;
20754 else
20755 unpack = gen_avx2_sign_extendv8hiv8si2;
20756 halfmode = V8HImode;
20757 extract
20758 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20759 break;
20760 case V8SImode:
20761 if (unsigned_p)
20762 unpack = gen_avx2_zero_extendv4siv4di2;
20763 else
20764 unpack = gen_avx2_sign_extendv4siv4di2;
20765 halfmode = V4SImode;
20766 extract
20767 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20768 break;
20769 case V16QImode:
20770 if (unsigned_p)
20771 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20772 else
20773 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20774 break;
20775 case V8HImode:
20776 if (unsigned_p)
20777 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20778 else
20779 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20780 break;
20781 case V4SImode:
20782 if (unsigned_p)
20783 unpack = gen_sse4_1_zero_extendv2siv2di2;
20784 else
20785 unpack = gen_sse4_1_sign_extendv2siv2di2;
20786 break;
20787 default:
20788 gcc_unreachable ();
20791 if (GET_MODE_SIZE (imode) == 32)
20793 tmp = gen_reg_rtx (halfmode);
20794 emit_insn (extract (tmp, src));
20796 else if (high_p)
20798 /* Shift higher 8 bytes to lower 8 bytes. */
20799 tmp = gen_reg_rtx (imode);
20800 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20801 gen_lowpart (V1TImode, src),
20802 GEN_INT (64)));
20804 else
20805 tmp = src;
20807 emit_insn (unpack (dest, tmp));
20809 else
20811 rtx (*unpack)(rtx, rtx, rtx);
20813 switch (imode)
20815 case V16QImode:
20816 if (high_p)
20817 unpack = gen_vec_interleave_highv16qi;
20818 else
20819 unpack = gen_vec_interleave_lowv16qi;
20820 break;
20821 case V8HImode:
20822 if (high_p)
20823 unpack = gen_vec_interleave_highv8hi;
20824 else
20825 unpack = gen_vec_interleave_lowv8hi;
20826 break;
20827 case V4SImode:
20828 if (high_p)
20829 unpack = gen_vec_interleave_highv4si;
20830 else
20831 unpack = gen_vec_interleave_lowv4si;
20832 break;
20833 default:
20834 gcc_unreachable ();
20837 if (unsigned_p)
20838 tmp = force_reg (imode, CONST0_RTX (imode));
20839 else
20840 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20841 src, pc_rtx, pc_rtx);
20843 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20847 /* Expand conditional increment or decrement using adc/sbb instructions.
20848 The default case using setcc followed by the conditional move can be
20849 done by generic code. */
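/* For illustration, "x = (a < b) ? y + 1 : y" with an unsigned
   comparison can be emitted as a compare of a with b (which leaves
   CF = (a < b)) followed by

     adc $0, y     ; add the carry, i.e. add 1 only when a < b

   and the conditional decrement variant uses sbb the same way.  */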
20850 bool
20851 ix86_expand_int_addcc (rtx operands[])
20853 enum rtx_code code = GET_CODE (operands[1]);
20854 rtx flags;
20855 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20856 rtx compare_op;
20857 rtx val = const0_rtx;
20858 bool fpcmp = false;
20859 enum machine_mode mode;
20860 rtx op0 = XEXP (operands[1], 0);
20861 rtx op1 = XEXP (operands[1], 1);
20863 if (operands[3] != const1_rtx
20864 && operands[3] != constm1_rtx)
20865 return false;
20866 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20867 return false;
20868 code = GET_CODE (compare_op);
20870 flags = XEXP (compare_op, 0);
20872 if (GET_MODE (flags) == CCFPmode
20873 || GET_MODE (flags) == CCFPUmode)
20875 fpcmp = true;
20876 code = ix86_fp_compare_code_to_integer (code);
20879 if (code != LTU)
20881 val = constm1_rtx;
20882 if (fpcmp)
20883 PUT_CODE (compare_op,
20884 reverse_condition_maybe_unordered
20885 (GET_CODE (compare_op)));
20886 else
20887 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20890 mode = GET_MODE (operands[0]);
20892 /* Construct either adc or sbb insn. */
20893 if ((code == LTU) == (operands[3] == constm1_rtx))
20895 switch (mode)
20897 case QImode:
20898 insn = gen_subqi3_carry;
20899 break;
20900 case HImode:
20901 insn = gen_subhi3_carry;
20902 break;
20903 case SImode:
20904 insn = gen_subsi3_carry;
20905 break;
20906 case DImode:
20907 insn = gen_subdi3_carry;
20908 break;
20909 default:
20910 gcc_unreachable ();
20913 else
20915 switch (mode)
20917 case QImode:
20918 insn = gen_addqi3_carry;
20919 break;
20920 case HImode:
20921 insn = gen_addhi3_carry;
20922 break;
20923 case SImode:
20924 insn = gen_addsi3_carry;
20925 break;
20926 case DImode:
20927 insn = gen_adddi3_carry;
20928 break;
20929 default:
20930 gcc_unreachable ();
20933 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20935 return true;
20939 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20940 but works for floating point parameters and non-offsettable memories.
20941 For pushes, it returns just stack offsets; the values will be saved
20942 in the right order. At most four parts are generated. */
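/* For illustration, on a 32-bit target a DFmode operand is returned as
   two SImode parts (low part first), XFmode as three and TFmode as
   four; on a 64-bit target XFmode becomes a DImode part plus an SImode
   upper part.  */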
20944 static int
20945 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20947 int size;
20949 if (!TARGET_64BIT)
20950 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20951 else
20952 size = (GET_MODE_SIZE (mode) + 4) / 8;
20954 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20955 gcc_assert (size >= 2 && size <= 4);
20957 /* Optimize a constant pool reference into immediates. This is used by fp
20958 moves, which force all constants to memory to allow combining. */
20959 if (MEM_P (operand) && MEM_READONLY_P (operand))
20961 rtx tmp = maybe_get_pool_constant (operand);
20962 if (tmp)
20963 operand = tmp;
20966 if (MEM_P (operand) && !offsettable_memref_p (operand))
20968 /* The only non-offsettable memories we handle are pushes. */
20969 int ok = push_operand (operand, VOIDmode);
20971 gcc_assert (ok);
20973 operand = copy_rtx (operand);
20974 PUT_MODE (operand, word_mode);
20975 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20976 return size;
20979 if (GET_CODE (operand) == CONST_VECTOR)
20981 enum machine_mode imode = int_mode_for_mode (mode);
20982 /* Caution: if we looked through a constant pool memory above,
20983 the operand may actually have a different mode now. That's
20984 ok, since we want to pun this all the way back to an integer. */
20985 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20986 gcc_assert (operand != NULL);
20987 mode = imode;
20990 if (!TARGET_64BIT)
20992 if (mode == DImode)
20993 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20994 else
20996 int i;
20998 if (REG_P (operand))
21000 gcc_assert (reload_completed);
21001 for (i = 0; i < size; i++)
21002 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21004 else if (offsettable_memref_p (operand))
21006 operand = adjust_address (operand, SImode, 0);
21007 parts[0] = operand;
21008 for (i = 1; i < size; i++)
21009 parts[i] = adjust_address (operand, SImode, 4 * i);
21011 else if (GET_CODE (operand) == CONST_DOUBLE)
21013 REAL_VALUE_TYPE r;
21014 long l[4];
21016 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21017 switch (mode)
21019 case TFmode:
21020 real_to_target (l, &r, mode);
21021 parts[3] = gen_int_mode (l[3], SImode);
21022 parts[2] = gen_int_mode (l[2], SImode);
21023 break;
21024 case XFmode:
21025 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21026 long double may not be 80-bit. */
21027 real_to_target (l, &r, mode);
21028 parts[2] = gen_int_mode (l[2], SImode);
21029 break;
21030 case DFmode:
21031 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21032 break;
21033 default:
21034 gcc_unreachable ();
21036 parts[1] = gen_int_mode (l[1], SImode);
21037 parts[0] = gen_int_mode (l[0], SImode);
21039 else
21040 gcc_unreachable ();
21043 else
21045 if (mode == TImode)
21046 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21047 if (mode == XFmode || mode == TFmode)
21049 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21050 if (REG_P (operand))
21052 gcc_assert (reload_completed);
21053 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21054 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21056 else if (offsettable_memref_p (operand))
21058 operand = adjust_address (operand, DImode, 0);
21059 parts[0] = operand;
21060 parts[1] = adjust_address (operand, upper_mode, 8);
21062 else if (GET_CODE (operand) == CONST_DOUBLE)
21064 REAL_VALUE_TYPE r;
21065 long l[4];
21067 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21068 real_to_target (l, &r, mode);
21070 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21071 if (HOST_BITS_PER_WIDE_INT >= 64)
21072 parts[0]
21073 = gen_int_mode
21074 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21075 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21076 DImode);
21077 else
21078 parts[0] = immed_double_const (l[0], l[1], DImode);
21080 if (upper_mode == SImode)
21081 parts[1] = gen_int_mode (l[2], SImode);
21082 else if (HOST_BITS_PER_WIDE_INT >= 64)
21083 parts[1]
21084 = gen_int_mode
21085 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21086 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21087 DImode);
21088 else
21089 parts[1] = immed_double_const (l[2], l[3], DImode);
21091 else
21092 gcc_unreachable ();
21096 return size;
21099 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21100 Return false when normal moves are needed; true when all required
21101 insns have been emitted. Operands 2-4 contain the input values
21102 in the correct order; operands 5-7 contain the output values. */
21104 void
21105 ix86_split_long_move (rtx operands[])
21107 rtx part[2][4];
21108 int nparts, i, j;
21109 int push = 0;
21110 int collisions = 0;
21111 enum machine_mode mode = GET_MODE (operands[0]);
21112 bool collisionparts[4];
21114 /* The DFmode expanders may ask us to move a double.
21115 For a 64-bit target this is a single move.  By hiding the fact
21116 here we simplify the i386.md splitters. */
21117 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21119 /* Optimize constant pool reference to immediates. This is used by
21120 fp moves, that force all constants to memory to allow combining. */
21122 if (MEM_P (operands[1])
21123 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21124 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21125 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21126 if (push_operand (operands[0], VOIDmode))
21128 operands[0] = copy_rtx (operands[0]);
21129 PUT_MODE (operands[0], word_mode);
21131 else
21132 operands[0] = gen_lowpart (DImode, operands[0]);
21133 operands[1] = gen_lowpart (DImode, operands[1]);
21134 emit_move_insn (operands[0], operands[1]);
21135 return;
21138 /* The only non-offsettable memory we handle is push. */
21139 if (push_operand (operands[0], VOIDmode))
21140 push = 1;
21141 else
21142 gcc_assert (!MEM_P (operands[0])
21143 || offsettable_memref_p (operands[0]));
21145 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21146 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21148 /* When emitting push, take care for source operands on the stack. */
21149 if (push && MEM_P (operands[1])
21150 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21152 rtx src_base = XEXP (part[1][nparts - 1], 0);
21154 /* Compensate for the stack decrement by 4. */
21155 if (!TARGET_64BIT && nparts == 3
21156 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21157 src_base = plus_constant (Pmode, src_base, 4);
21159 /* src_base refers to the stack pointer and is
21160 automatically decreased by the emitted pushes. */
21161 for (i = 0; i < nparts; i++)
21162 part[1][i] = change_address (part[1][i],
21163 GET_MODE (part[1][i]), src_base);
21166 /* We need to do copy in the right order in case an address register
21167 of the source overlaps the destination. */
21168 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21170 rtx tmp;
21172 for (i = 0; i < nparts; i++)
21174 collisionparts[i]
21175 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21176 if (collisionparts[i])
21177 collisions++;
21180 /* Collision in the middle part can be handled by reordering. */
21181 if (collisions == 1 && nparts == 3 && collisionparts [1])
21183 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21184 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21186 else if (collisions == 1
21187 && nparts == 4
21188 && (collisionparts [1] || collisionparts [2]))
21190 if (collisionparts [1])
21192 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21193 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21195 else
21197 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21198 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21202 /* If there are more collisions, we can't handle it by reordering.
21203 Do an lea to the last part and use only one colliding move. */
21204 else if (collisions > 1)
21206 rtx base;
21208 collisions = 1;
21210 base = part[0][nparts - 1];
21212 /* Handle the case when the last part isn't valid for lea.
21213 Happens in 64-bit mode storing the 12-byte XFmode. */
21214 if (GET_MODE (base) != Pmode)
21215 base = gen_rtx_REG (Pmode, REGNO (base));
21217 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21218 part[1][0] = replace_equiv_address (part[1][0], base);
21219 for (i = 1; i < nparts; i++)
21221 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21222 part[1][i] = replace_equiv_address (part[1][i], tmp);
21227 if (push)
21229 if (!TARGET_64BIT)
21231 if (nparts == 3)
21233 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21234 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21235 stack_pointer_rtx, GEN_INT (-4)));
21236 emit_move_insn (part[0][2], part[1][2]);
21238 else if (nparts == 4)
21240 emit_move_insn (part[0][3], part[1][3]);
21241 emit_move_insn (part[0][2], part[1][2]);
21244 else
21246 /* In 64-bit mode we don't have a 32-bit push available.  If this is a
21247 register, that is OK - we will just use the larger counterpart.  We also
21248 retype memory - this comes from an attempt to avoid the REX prefix on
21249 moving the second half of a TFmode value. */
21250 if (GET_MODE (part[1][1]) == SImode)
21252 switch (GET_CODE (part[1][1]))
21254 case MEM:
21255 part[1][1] = adjust_address (part[1][1], DImode, 0);
21256 break;
21258 case REG:
21259 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21260 break;
21262 default:
21263 gcc_unreachable ();
21266 if (GET_MODE (part[1][0]) == SImode)
21267 part[1][0] = part[1][1];
21270 emit_move_insn (part[0][1], part[1][1]);
21271 emit_move_insn (part[0][0], part[1][0]);
21272 return;
21275 /* Choose correct order to not overwrite the source before it is copied. */
21276 if ((REG_P (part[0][0])
21277 && REG_P (part[1][1])
21278 && (REGNO (part[0][0]) == REGNO (part[1][1])
21279 || (nparts == 3
21280 && REGNO (part[0][0]) == REGNO (part[1][2]))
21281 || (nparts == 4
21282 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21283 || (collisions > 0
21284 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21286 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21288 operands[2 + i] = part[0][j];
21289 operands[6 + i] = part[1][j];
21292 else
21294 for (i = 0; i < nparts; i++)
21296 operands[2 + i] = part[0][i];
21297 operands[6 + i] = part[1][i];
21301 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21302 if (optimize_insn_for_size_p ())
21304 for (j = 0; j < nparts - 1; j++)
21305 if (CONST_INT_P (operands[6 + j])
21306 && operands[6 + j] != const0_rtx
21307 && REG_P (operands[2 + j]))
21308 for (i = j; i < nparts - 1; i++)
21309 if (CONST_INT_P (operands[7 + i])
21310 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21311 operands[7 + i] = operands[2 + j];
21314 for (i = 0; i < nparts; i++)
21315 emit_move_insn (operands[2 + i], operands[6 + i]);
21317 return;
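/* A minimal, hedged sketch (not part of GCC) of the ordering idea used in
   ix86_split_long_move above: when a destination part is also needed as a
   later source part, copying the parts back to front avoids clobbering the
   source.  The function name and the fixed 4-part layout are illustrative
   assumptions only.  */
static void
copy_parts_sketch (long dst[4], const long src[4], int nparts, int overlap)
{
  int i;
  if (overlap)
    /* Destination overlaps a later source part: copy the high part first.  */
    for (i = nparts - 1; i >= 0; i--)
      dst[i] = src[i];
  else
    /* No overlap: the natural low-to-high order is fine.  */
    for (i = 0; i < nparts; i++)
      dst[i] = src[i];
}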
21320 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21321 left shift by a constant, either using a single shift or
21322 a sequence of add instructions. */
21324 static void
21325 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21327 rtx (*insn)(rtx, rtx, rtx);
21329 if (count == 1
21330 || (count * ix86_cost->add <= ix86_cost->shift_const
21331 && !optimize_insn_for_size_p ()))
21333 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21334 while (count-- > 0)
21335 emit_insn (insn (operand, operand, operand));
21337 else
21339 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21340 emit_insn (insn (operand, operand, GEN_INT (count)));
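/* Hedged illustration (not GCC code) of the transformation performed by
   ix86_expand_ashl_const above: a left shift by COUNT is replaced by COUNT
   self-additions when that is cheaper, e.g. "x << 1" becomes "x + x".  The
   helper below only models the emitted sequence.  */
static unsigned long
ashl_by_adds_sketch (unsigned long x, int count)
{
  while (count-- > 0)
    x = x + x;            /* each add doubles the value, i.e. shifts left by 1 */
  return x;
}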
21344 void
21345 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21347 rtx (*gen_ashl3)(rtx, rtx, rtx);
21348 rtx (*gen_shld)(rtx, rtx, rtx);
21349 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21351 rtx low[2], high[2];
21352 int count;
21354 if (CONST_INT_P (operands[2]))
21356 split_double_mode (mode, operands, 2, low, high);
21357 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21359 if (count >= half_width)
21361 emit_move_insn (high[0], low[1]);
21362 emit_move_insn (low[0], const0_rtx);
21364 if (count > half_width)
21365 ix86_expand_ashl_const (high[0], count - half_width, mode);
21367 else
21369 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21371 if (!rtx_equal_p (operands[0], operands[1]))
21372 emit_move_insn (operands[0], operands[1]);
21374 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21375 ix86_expand_ashl_const (low[0], count, mode);
21377 return;
21380 split_double_mode (mode, operands, 1, low, high);
21382 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21384 if (operands[1] == const1_rtx)
21386 /* Assuming we've chosen QImode-capable registers, 1 << N
21387 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21388 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21390 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21392 ix86_expand_clear (low[0]);
21393 ix86_expand_clear (high[0]);
21394 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21396 d = gen_lowpart (QImode, low[0]);
21397 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21398 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21399 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21401 d = gen_lowpart (QImode, high[0]);
21402 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21403 s = gen_rtx_NE (QImode, flags, const0_rtx);
21404 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21407 /* Otherwise, we can get the same results by manually performing
21408 a bit extract operation on bit 5/6, and then performing the two
21409 shifts. The two methods of getting 0/1 into low/high are exactly
21410 the same size. Avoiding the shift in the bit extract case helps
21411 pentium4 a bit; no one else seems to care much either way. */
21412 else
21414 enum machine_mode half_mode;
21415 rtx (*gen_lshr3)(rtx, rtx, rtx);
21416 rtx (*gen_and3)(rtx, rtx, rtx);
21417 rtx (*gen_xor3)(rtx, rtx, rtx);
21418 HOST_WIDE_INT bits;
21419 rtx x;
21421 if (mode == DImode)
21423 half_mode = SImode;
21424 gen_lshr3 = gen_lshrsi3;
21425 gen_and3 = gen_andsi3;
21426 gen_xor3 = gen_xorsi3;
21427 bits = 5;
21429 else
21431 half_mode = DImode;
21432 gen_lshr3 = gen_lshrdi3;
21433 gen_and3 = gen_anddi3;
21434 gen_xor3 = gen_xordi3;
21435 bits = 6;
21438 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21439 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21440 else
21441 x = gen_lowpart (half_mode, operands[2]);
21442 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21444 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21445 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21446 emit_move_insn (low[0], high[0]);
21447 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21450 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21451 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21452 return;
21455 if (operands[1] == constm1_rtx)
21457 /* For -1 << N, we can avoid the shld instruction, because we
21458 know that we're shifting 0...31/63 ones into a -1. */
21459 emit_move_insn (low[0], constm1_rtx);
21460 if (optimize_insn_for_size_p ())
21461 emit_move_insn (high[0], low[0]);
21462 else
21463 emit_move_insn (high[0], constm1_rtx);
21465 else
21467 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21469 if (!rtx_equal_p (operands[0], operands[1]))
21470 emit_move_insn (operands[0], operands[1]);
21472 split_double_mode (mode, operands, 1, low, high);
21473 emit_insn (gen_shld (high[0], low[0], operands[2]));
21476 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21478 if (TARGET_CMOVE && scratch)
21480 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21481 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21483 ix86_expand_clear (scratch);
21484 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21486 else
21488 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21489 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21491 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
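/* Hedged C model (not GCC code) of the split performed by ix86_split_ashl
   above for a double-word left shift using two halves: counts below the
   half width use an shld-style insert into the high half plus a plain shift
   of the low half; counts of the half width or more move the low half up
   and clear it.  The 32-bit half width here is an assumption for
   concreteness.  */
static void
dw_ashl_sketch (unsigned int *lo, unsigned int *hi, unsigned int count)
{
  count &= 63;
  if (count >= 32)
    {
      *hi = *lo << (count - 32);   /* high half gets the low half, shifted */
      *lo = 0;
    }
  else if (count > 0)
    {
      *hi = (*hi << count) | (*lo >> (32 - count));   /* like shld */
      *lo <<= count;                                  /* like shl  */
    }
}
/* Note: shld fills the vacated high-half bits from the low half, which is
   exactly what the OR of the two shifted halves models here.  */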
21495 void
21496 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21498 rtx (*gen_ashr3)(rtx, rtx, rtx)
21499 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21500 rtx (*gen_shrd)(rtx, rtx, rtx);
21501 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21503 rtx low[2], high[2];
21504 int count;
21506 if (CONST_INT_P (operands[2]))
21508 split_double_mode (mode, operands, 2, low, high);
21509 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21511 if (count == GET_MODE_BITSIZE (mode) - 1)
21513 emit_move_insn (high[0], high[1]);
21514 emit_insn (gen_ashr3 (high[0], high[0],
21515 GEN_INT (half_width - 1)));
21516 emit_move_insn (low[0], high[0]);
21519 else if (count >= half_width)
21521 emit_move_insn (low[0], high[1]);
21522 emit_move_insn (high[0], low[0]);
21523 emit_insn (gen_ashr3 (high[0], high[0],
21524 GEN_INT (half_width - 1)));
21526 if (count > half_width)
21527 emit_insn (gen_ashr3 (low[0], low[0],
21528 GEN_INT (count - half_width)));
21530 else
21532 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21534 if (!rtx_equal_p (operands[0], operands[1]))
21535 emit_move_insn (operands[0], operands[1]);
21537 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21538 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21541 else
21543 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21545 if (!rtx_equal_p (operands[0], operands[1]))
21546 emit_move_insn (operands[0], operands[1]);
21548 split_double_mode (mode, operands, 1, low, high);
21550 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21551 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21553 if (TARGET_CMOVE && scratch)
21555 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21556 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21558 emit_move_insn (scratch, high[0]);
21559 emit_insn (gen_ashr3 (scratch, scratch,
21560 GEN_INT (half_width - 1)));
21561 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21562 scratch));
21564 else
21566 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21567 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21569 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
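/* Hedged C model (not GCC code) of the arithmetic right shift split done by
   ix86_split_ashr above: for counts of the half width or more, the low half
   receives the shifted high half and the high half is filled with copies of
   the sign bit; smaller counts use an shrd-style extract plus a plain sar of
   the high half.  Assumes signed right shift is arithmetic, as on x86
   compilers.  */
static void
dw_ashr_sketch (unsigned int *lo, int *hi, unsigned int count)
{
  count &= 63;
  if (count >= 32)
    {
      *lo = (unsigned int) (*hi >> (count - 32));   /* low gets shifted high */
      *hi >>= 31;                                   /* sign fill */
    }
  else if (count > 0)
    {
      *lo = (*lo >> count) | ((unsigned int) *hi << (32 - count));  /* shrd */
      *hi >>= count;                                                /* sar  */
    }
}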
21574 void
21575 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21577 rtx (*gen_lshr3)(rtx, rtx, rtx)
21578 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21579 rtx (*gen_shrd)(rtx, rtx, rtx);
21580 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21582 rtx low[2], high[2];
21583 int count;
21585 if (CONST_INT_P (operands[2]))
21587 split_double_mode (mode, operands, 2, low, high);
21588 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21590 if (count >= half_width)
21592 emit_move_insn (low[0], high[1]);
21593 ix86_expand_clear (high[0]);
21595 if (count > half_width)
21596 emit_insn (gen_lshr3 (low[0], low[0],
21597 GEN_INT (count - half_width)));
21599 else
21601 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21603 if (!rtx_equal_p (operands[0], operands[1]))
21604 emit_move_insn (operands[0], operands[1]);
21606 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21607 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21610 else
21612 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21614 if (!rtx_equal_p (operands[0], operands[1]))
21615 emit_move_insn (operands[0], operands[1]);
21617 split_double_mode (mode, operands, 1, low, high);
21619 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21620 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21622 if (TARGET_CMOVE && scratch)
21624 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21625 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21627 ix86_expand_clear (scratch);
21628 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21629 scratch));
21631 else
21633 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21634 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21636 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21641 /* Predict just emitted jump instruction to be taken with probability PROB. */
21642 static void
21643 predict_jump (int prob)
21645 rtx insn = get_last_insn ();
21646 gcc_assert (JUMP_P (insn));
21647 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21650 /* Helper function for the string operations below.  Test whether VARIABLE
21651 is aligned to VALUE bytes.  If so, jump to the label. */
21652 static rtx
21653 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21655 rtx label = gen_label_rtx ();
21656 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21657 if (GET_MODE (variable) == DImode)
21658 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21659 else
21660 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21661 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21662 1, label);
21663 if (epilogue)
21664 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21665 else
21666 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21667 return label;
21670 /* Decrease COUNTREG by VALUE. */
21671 static void
21672 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21674 rtx (*gen_add)(rtx, rtx, rtx)
21675 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21677 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21680 /* Zero extend possibly SImode EXP to a Pmode register. */
21681 rtx
21682 ix86_zero_extend_to_Pmode (rtx exp)
21684 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21687 /* Divide COUNTREG by SCALE. */
21688 static rtx
21689 scale_counter (rtx countreg, int scale)
21691 rtx sc;
21693 if (scale == 1)
21694 return countreg;
21695 if (CONST_INT_P (countreg))
21696 return GEN_INT (INTVAL (countreg) / scale);
21697 gcc_assert (REG_P (countreg));
21699 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21700 GEN_INT (exact_log2 (scale)),
21701 NULL, 1, OPTAB_DIRECT);
21702 return sc;
21705 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21706 DImode for constant loop counts. */
21708 static enum machine_mode
21709 counter_mode (rtx count_exp)
21711 if (GET_MODE (count_exp) != VOIDmode)
21712 return GET_MODE (count_exp);
21713 if (!CONST_INT_P (count_exp))
21714 return Pmode;
21715 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21716 return DImode;
21717 return SImode;
21720 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed to
21721 by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
21722 overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output the
21723 equivalent loop to set memory by VALUE (supposed to be in MODE).
21725 The size is rounded down to a whole number of chunks moved at once.
21726 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
21729 static void
21730 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21731 rtx destptr, rtx srcptr, rtx value,
21732 rtx count, enum machine_mode mode, int unroll,
21733 int expected_size)
21735 rtx out_label, top_label, iter, tmp;
21736 enum machine_mode iter_mode = counter_mode (count);
21737 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21738 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21739 rtx size;
21740 rtx x_addr;
21741 rtx y_addr;
21742 int i;
21744 top_label = gen_label_rtx ();
21745 out_label = gen_label_rtx ();
21746 iter = gen_reg_rtx (iter_mode);
21748 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21749 NULL, 1, OPTAB_DIRECT);
21750 /* Those two should combine. */
21751 if (piece_size == const1_rtx)
21753 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21754 true, out_label);
21755 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21757 emit_move_insn (iter, const0_rtx);
21759 emit_label (top_label);
21761 tmp = convert_modes (Pmode, iter_mode, iter, true);
21762 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21763 destmem = change_address (destmem, mode, x_addr);
21765 if (srcmem)
21767 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21768 srcmem = change_address (srcmem, mode, y_addr);
21770 /* When unrolling for chips that reorder memory reads and writes,
21771 we can save registers by using a single temporary.
21772 Also, using 4 temporaries is overkill in 32-bit mode. */
21773 if (!TARGET_64BIT && 0)
21775 for (i = 0; i < unroll; i++)
21777 if (i)
21779 destmem =
21780 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21781 srcmem =
21782 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21784 emit_move_insn (destmem, srcmem);
21787 else
21789 rtx tmpreg[4];
21790 gcc_assert (unroll <= 4);
21791 for (i = 0; i < unroll; i++)
21793 tmpreg[i] = gen_reg_rtx (mode);
21794 if (i)
21796 srcmem =
21797 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21799 emit_move_insn (tmpreg[i], srcmem);
21801 for (i = 0; i < unroll; i++)
21803 if (i)
21805 destmem =
21806 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21808 emit_move_insn (destmem, tmpreg[i]);
21812 else
21813 for (i = 0; i < unroll; i++)
21815 if (i)
21816 destmem =
21817 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21818 emit_move_insn (destmem, value);
21821 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21822 true, OPTAB_LIB_WIDEN);
21823 if (tmp != iter)
21824 emit_move_insn (iter, tmp);
21826 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21827 true, top_label);
21828 if (expected_size != -1)
21830 expected_size /= GET_MODE_SIZE (mode) * unroll;
21831 if (expected_size == 0)
21832 predict_jump (0);
21833 else if (expected_size > REG_BR_PROB_BASE)
21834 predict_jump (REG_BR_PROB_BASE - 1);
21835 else
21836 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21838 else
21839 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21840 iter = ix86_zero_extend_to_Pmode (iter);
21841 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21842 true, OPTAB_LIB_WIDEN);
21843 if (tmp != destptr)
21844 emit_move_insn (destptr, tmp);
21845 if (srcptr)
21847 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21848 true, OPTAB_LIB_WIDEN);
21849 if (tmp != srcptr)
21850 emit_move_insn (srcptr, tmp);
21852 emit_label (out_label);
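/* Hedged sketch (plain C, not the emitted RTL) of the loop shape produced by
   expand_set_or_movmem_via_loop above: the byte count is rounded down to
   whole unrolled chunks, a single index walks both pointers, and UNROLL
   chunk moves are issued per iteration.  CHUNK and UNROLL model
   GET_MODE_SIZE (mode) and the unroll factor, both assumed to be powers of
   two so the mask trick is valid.  */
static void
move_loop_sketch (unsigned char *dst, const unsigned char *src,
                  unsigned long count, unsigned long chunk, int unroll)
{
  unsigned long piece = chunk * (unsigned long) unroll;
  unsigned long size = count & ~(piece - 1);   /* like piece_size_mask */
  unsigned long iter, i;
  for (iter = 0; iter < size; iter += piece)
    for (i = 0; i < piece; i++)                /* stands in for the chunk moves */
      dst[iter + i] = src[iter + i];
}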
21855 /* Output "rep; mov" instruction.
21856 Arguments have same meaning as for previous function */
21857 static void
21858 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21859 rtx destptr, rtx srcptr,
21860 rtx count,
21861 enum machine_mode mode)
21863 rtx destexp;
21864 rtx srcexp;
21865 rtx countreg;
21866 HOST_WIDE_INT rounded_count;
21868 /* If the size is known, it is shorter to use rep movs. */
21869 if (mode == QImode && CONST_INT_P (count)
21870 && !(INTVAL (count) & 3))
21871 mode = SImode;
21873 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21874 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21875 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21876 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21877 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21878 if (mode != QImode)
21880 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21881 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21882 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21883 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21884 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21885 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21887 else
21889 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21890 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21892 if (CONST_INT_P (count))
21894 rounded_count = (INTVAL (count)
21895 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21896 destmem = shallow_copy_rtx (destmem);
21897 srcmem = shallow_copy_rtx (srcmem);
21898 set_mem_size (destmem, rounded_count);
21899 set_mem_size (srcmem, rounded_count);
21901 else
21903 if (MEM_SIZE_KNOWN_P (destmem))
21904 clear_mem_size (destmem);
21905 if (MEM_SIZE_KNOWN_P (srcmem))
21906 clear_mem_size (srcmem);
21908 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21909 destexp, srcexp));
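/* Hedged model (not GCC code) of the "rep mov" expansion above: the byte
   count is scaled down to a chunk count, whole chunks are copied, and both
   pointers end up advanced by the number of bytes actually moved, which is
   what the DESTEXP/SRCEXP expressions describe.  CHUNK models
   GET_MODE_SIZE (mode).  */
static void
rep_mov_sketch (unsigned char **dstp, const unsigned char **srcp,
                unsigned long count, unsigned long chunk)
{
  unsigned long n = count / chunk;       /* like scale_counter */
  unsigned long bytes = n * chunk;       /* tail smaller than CHUNK is left alone */
  unsigned long i;
  for (i = 0; i < bytes; i++)
    (*dstp)[i] = (*srcp)[i];
  *dstp += bytes;                        /* destexp: dest + scaled count */
  *srcp += bytes;                        /* srcexp:  src  + scaled count */
}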
21912 /* Output "rep; stos" instruction.
21913 Arguments have same meaning as for previous function */
21914 static void
21915 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21916 rtx count, enum machine_mode mode,
21917 rtx orig_value)
21919 rtx destexp;
21920 rtx countreg;
21921 HOST_WIDE_INT rounded_count;
21923 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21924 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21925 value = force_reg (mode, gen_lowpart (mode, value));
21926 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21927 if (mode != QImode)
21929 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21930 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21931 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21933 else
21934 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21935 if (orig_value == const0_rtx && CONST_INT_P (count))
21937 rounded_count = (INTVAL (count)
21938 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21939 destmem = shallow_copy_rtx (destmem);
21940 set_mem_size (destmem, rounded_count);
21942 else if (MEM_SIZE_KNOWN_P (destmem))
21943 clear_mem_size (destmem);
21944 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21947 static void
21948 emit_strmov (rtx destmem, rtx srcmem,
21949 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21951 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21952 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21953 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21956 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21957 static void
21958 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21959 rtx destptr, rtx srcptr, rtx count, int max_size)
21961 rtx src, dest;
21962 if (CONST_INT_P (count))
21964 HOST_WIDE_INT countval = INTVAL (count);
21965 int offset = 0;
21967 if ((countval & 0x10) && max_size > 16)
21969 if (TARGET_64BIT)
21971 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21972 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21974 else
21975 gcc_unreachable ();
21976 offset += 16;
21978 if ((countval & 0x08) && max_size > 8)
21980 if (TARGET_64BIT)
21981 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21982 else
21984 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21985 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21987 offset += 8;
21989 if ((countval & 0x04) && max_size > 4)
21991 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21992 offset += 4;
21994 if ((countval & 0x02) && max_size > 2)
21996 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21997 offset += 2;
21999 if ((countval & 0x01) && max_size > 1)
22001 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
22002 offset += 1;
22004 return;
22006 if (max_size > 8)
22008 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22009 count, 1, OPTAB_DIRECT);
22010 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22011 count, QImode, 1, 4);
22012 return;
22015 /* When there are single string operations, we can cheaply increase the dest
22016 and src pointers.  Otherwise we save code size by maintaining an offset
22017 (zero is readily available from the preceding rep operation) and using x86 addressing modes. */
22019 if (TARGET_SINGLE_STRINGOP)
22021 if (max_size > 4)
22023 rtx label = ix86_expand_aligntest (count, 4, true);
22024 src = change_address (srcmem, SImode, srcptr);
22025 dest = change_address (destmem, SImode, destptr);
22026 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22027 emit_label (label);
22028 LABEL_NUSES (label) = 1;
22030 if (max_size > 2)
22032 rtx label = ix86_expand_aligntest (count, 2, true);
22033 src = change_address (srcmem, HImode, srcptr);
22034 dest = change_address (destmem, HImode, destptr);
22035 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22036 emit_label (label);
22037 LABEL_NUSES (label) = 1;
22039 if (max_size > 1)
22041 rtx label = ix86_expand_aligntest (count, 1, true);
22042 src = change_address (srcmem, QImode, srcptr);
22043 dest = change_address (destmem, QImode, destptr);
22044 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22045 emit_label (label);
22046 LABEL_NUSES (label) = 1;
22049 else
22051 rtx offset = force_reg (Pmode, const0_rtx);
22052 rtx tmp;
22054 if (max_size > 4)
22056 rtx label = ix86_expand_aligntest (count, 4, true);
22057 src = change_address (srcmem, SImode, srcptr);
22058 dest = change_address (destmem, SImode, destptr);
22059 emit_move_insn (dest, src);
22060 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22061 true, OPTAB_LIB_WIDEN);
22062 if (tmp != offset)
22063 emit_move_insn (offset, tmp);
22064 emit_label (label);
22065 LABEL_NUSES (label) = 1;
22067 if (max_size > 2)
22069 rtx label = ix86_expand_aligntest (count, 2, true);
22070 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22071 src = change_address (srcmem, HImode, tmp);
22072 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22073 dest = change_address (destmem, HImode, tmp);
22074 emit_move_insn (dest, src);
22075 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22076 true, OPTAB_LIB_WIDEN);
22077 if (tmp != offset)
22078 emit_move_insn (offset, tmp);
22079 emit_label (label);
22080 LABEL_NUSES (label) = 1;
22082 if (max_size > 1)
22084 rtx label = ix86_expand_aligntest (count, 1, true);
22085 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22086 src = change_address (srcmem, QImode, tmp);
22087 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22088 dest = change_address (destmem, QImode, tmp);
22089 emit_move_insn (dest, src);
22090 emit_label (label);
22091 LABEL_NUSES (label) = 1;
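/* Hedged sketch (not GCC code) of the constant-count epilogue above: the low
   bits of the remaining COUNT select which tail moves are emitted, largest
   chunk first, mirroring the 8/4/2/1-byte tests (the 16-byte case is 64-bit
   only and is omitted here).  */
static void
tail_move_sketch (unsigned char *dst, const unsigned char *src,
                  unsigned long count)
{
  unsigned long off = 0, i, step;
  for (step = 8; step >= 1; step >>= 1)
    if (count & step)
      {
        for (i = 0; i < step; i++)     /* one move of STEP bytes */
          dst[off + i] = src[off + i];
        off += step;
      }
}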
22096 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22097 static void
22098 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22099 rtx count, int max_size)
22101 count =
22102 expand_simple_binop (counter_mode (count), AND, count,
22103 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22104 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22105 gen_lowpart (QImode, value), count, QImode,
22106 1, max_size / 2);
22109 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22110 static void
22111 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22113 rtx dest;
22115 if (CONST_INT_P (count))
22117 HOST_WIDE_INT countval = INTVAL (count);
22118 int offset = 0;
22120 if ((countval & 0x10) && max_size > 16)
22122 if (TARGET_64BIT)
22124 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22125 emit_insn (gen_strset (destptr, dest, value));
22126 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22127 emit_insn (gen_strset (destptr, dest, value));
22129 else
22130 gcc_unreachable ();
22131 offset += 16;
22133 if ((countval & 0x08) && max_size > 8)
22135 if (TARGET_64BIT)
22137 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22138 emit_insn (gen_strset (destptr, dest, value));
22140 else
22142 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22143 emit_insn (gen_strset (destptr, dest, value));
22144 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22145 emit_insn (gen_strset (destptr, dest, value));
22147 offset += 8;
22149 if ((countval & 0x04) && max_size > 4)
22151 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22152 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22153 offset += 4;
22155 if ((countval & 0x02) && max_size > 2)
22157 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22158 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22159 offset += 2;
22161 if ((countval & 0x01) && max_size > 1)
22163 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22164 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22165 offset += 1;
22167 return;
22169 if (max_size > 32)
22171 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22172 return;
22174 if (max_size > 16)
22176 rtx label = ix86_expand_aligntest (count, 16, true);
22177 if (TARGET_64BIT)
22179 dest = change_address (destmem, DImode, destptr);
22180 emit_insn (gen_strset (destptr, dest, value));
22181 emit_insn (gen_strset (destptr, dest, value));
22183 else
22185 dest = change_address (destmem, SImode, destptr);
22186 emit_insn (gen_strset (destptr, dest, value));
22187 emit_insn (gen_strset (destptr, dest, value));
22188 emit_insn (gen_strset (destptr, dest, value));
22189 emit_insn (gen_strset (destptr, dest, value));
22191 emit_label (label);
22192 LABEL_NUSES (label) = 1;
22194 if (max_size > 8)
22196 rtx label = ix86_expand_aligntest (count, 8, true);
22197 if (TARGET_64BIT)
22199 dest = change_address (destmem, DImode, destptr);
22200 emit_insn (gen_strset (destptr, dest, value));
22202 else
22204 dest = change_address (destmem, SImode, destptr);
22205 emit_insn (gen_strset (destptr, dest, value));
22206 emit_insn (gen_strset (destptr, dest, value));
22208 emit_label (label);
22209 LABEL_NUSES (label) = 1;
22211 if (max_size > 4)
22213 rtx label = ix86_expand_aligntest (count, 4, true);
22214 dest = change_address (destmem, SImode, destptr);
22215 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22216 emit_label (label);
22217 LABEL_NUSES (label) = 1;
22219 if (max_size > 2)
22221 rtx label = ix86_expand_aligntest (count, 2, true);
22222 dest = change_address (destmem, HImode, destptr);
22223 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22224 emit_label (label);
22225 LABEL_NUSES (label) = 1;
22227 if (max_size > 1)
22229 rtx label = ix86_expand_aligntest (count, 1, true);
22230 dest = change_address (destmem, QImode, destptr);
22231 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22232 emit_label (label);
22233 LABEL_NUSES (label) = 1;
22237 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22238 to DESIRED_ALIGNMENT. */
22239 static void
22240 expand_movmem_prologue (rtx destmem, rtx srcmem,
22241 rtx destptr, rtx srcptr, rtx count,
22242 int align, int desired_alignment)
22244 if (align <= 1 && desired_alignment > 1)
22246 rtx label = ix86_expand_aligntest (destptr, 1, false);
22247 srcmem = change_address (srcmem, QImode, srcptr);
22248 destmem = change_address (destmem, QImode, destptr);
22249 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22250 ix86_adjust_counter (count, 1);
22251 emit_label (label);
22252 LABEL_NUSES (label) = 1;
22254 if (align <= 2 && desired_alignment > 2)
22256 rtx label = ix86_expand_aligntest (destptr, 2, false);
22257 srcmem = change_address (srcmem, HImode, srcptr);
22258 destmem = change_address (destmem, HImode, destptr);
22259 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22260 ix86_adjust_counter (count, 2);
22261 emit_label (label);
22262 LABEL_NUSES (label) = 1;
22264 if (align <= 4 && desired_alignment > 4)
22266 rtx label = ix86_expand_aligntest (destptr, 4, false);
22267 srcmem = change_address (srcmem, SImode, srcptr);
22268 destmem = change_address (destmem, SImode, destptr);
22269 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22270 ix86_adjust_counter (count, 4);
22271 emit_label (label);
22272 LABEL_NUSES (label) = 1;
22274 gcc_assert (desired_alignment <= 8);
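/* Hedged sketch (not GCC code) of the alignment prologue above: peel off 1-,
   2- and 4-byte copies whenever the destination pointer has the
   corresponding low bit set, until DESIRED_ALIGN is reached, decreasing the
   remaining count the way the real code does with ix86_adjust_counter.  The
   pointer-to-integer cast is only for this illustration and assumes pointers
   fit in an unsigned long.  */
static void
align_dest_sketch (unsigned char **dstp, const unsigned char **srcp,
                   unsigned long *count, unsigned long desired_align)
{
  unsigned long step;
  for (step = 1; step < desired_align; step <<= 1)
    if ((unsigned long) *dstp & step)
      {
        unsigned long i;
        for (i = 0; i < step; i++)     /* one copy of STEP bytes */
          (*dstp)[i] = (*srcp)[i];
        *dstp += step;
        *srcp += step;
        *count -= step;
      }
}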
22277 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22278 ALIGN_BYTES is how many bytes need to be copied. */
22279 static rtx
22280 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22281 int desired_align, int align_bytes)
22283 rtx src = *srcp;
22284 rtx orig_dst = dst;
22285 rtx orig_src = src;
22286 int off = 0;
22287 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22288 if (src_align_bytes >= 0)
22289 src_align_bytes = desired_align - src_align_bytes;
22290 if (align_bytes & 1)
22292 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22293 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22294 off = 1;
22295 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22297 if (align_bytes & 2)
22299 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22300 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22301 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22302 set_mem_align (dst, 2 * BITS_PER_UNIT);
22303 if (src_align_bytes >= 0
22304 && (src_align_bytes & 1) == (align_bytes & 1)
22305 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22306 set_mem_align (src, 2 * BITS_PER_UNIT);
22307 off = 2;
22308 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22310 if (align_bytes & 4)
22312 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22313 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22314 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22315 set_mem_align (dst, 4 * BITS_PER_UNIT);
22316 if (src_align_bytes >= 0)
22318 unsigned int src_align = 0;
22319 if ((src_align_bytes & 3) == (align_bytes & 3))
22320 src_align = 4;
22321 else if ((src_align_bytes & 1) == (align_bytes & 1))
22322 src_align = 2;
22323 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22324 set_mem_align (src, src_align * BITS_PER_UNIT);
22326 off = 4;
22327 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22329 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22330 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22331 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22332 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22333 if (src_align_bytes >= 0)
22335 unsigned int src_align = 0;
22336 if ((src_align_bytes & 7) == (align_bytes & 7))
22337 src_align = 8;
22338 else if ((src_align_bytes & 3) == (align_bytes & 3))
22339 src_align = 4;
22340 else if ((src_align_bytes & 1) == (align_bytes & 1))
22341 src_align = 2;
22342 if (src_align > (unsigned int) desired_align)
22343 src_align = desired_align;
22344 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22345 set_mem_align (src, src_align * BITS_PER_UNIT);
22347 if (MEM_SIZE_KNOWN_P (orig_dst))
22348 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22349 if (MEM_SIZE_KNOWN_P (orig_src))
22350 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22351 *srcp = src;
22352 return dst;
22355 /* Set enough of DEST to align DEST, known to be aligned by ALIGN,
22356 to DESIRED_ALIGNMENT. */
22357 static void
22358 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22359 int align, int desired_alignment)
22361 if (align <= 1 && desired_alignment > 1)
22363 rtx label = ix86_expand_aligntest (destptr, 1, false);
22364 destmem = change_address (destmem, QImode, destptr);
22365 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22366 ix86_adjust_counter (count, 1);
22367 emit_label (label);
22368 LABEL_NUSES (label) = 1;
22370 if (align <= 2 && desired_alignment > 2)
22372 rtx label = ix86_expand_aligntest (destptr, 2, false);
22373 destmem = change_address (destmem, HImode, destptr);
22374 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22375 ix86_adjust_counter (count, 2);
22376 emit_label (label);
22377 LABEL_NUSES (label) = 1;
22379 if (align <= 4 && desired_alignment > 4)
22381 rtx label = ix86_expand_aligntest (destptr, 4, false);
22382 destmem = change_address (destmem, SImode, destptr);
22383 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22384 ix86_adjust_counter (count, 4);
22385 emit_label (label);
22386 LABEL_NUSES (label) = 1;
22388 gcc_assert (desired_alignment <= 8);
22391 /* Set enough of DST to align DST, known to be aligned by ALIGN, to
22392 DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored. */
22393 static rtx
22394 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22395 int desired_align, int align_bytes)
22397 int off = 0;
22398 rtx orig_dst = dst;
22399 if (align_bytes & 1)
22401 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22402 off = 1;
22403 emit_insn (gen_strset (destreg, dst,
22404 gen_lowpart (QImode, value)));
22406 if (align_bytes & 2)
22408 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22409 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22410 set_mem_align (dst, 2 * BITS_PER_UNIT);
22411 off = 2;
22412 emit_insn (gen_strset (destreg, dst,
22413 gen_lowpart (HImode, value)));
22415 if (align_bytes & 4)
22417 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22418 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22419 set_mem_align (dst, 4 * BITS_PER_UNIT);
22420 off = 4;
22421 emit_insn (gen_strset (destreg, dst,
22422 gen_lowpart (SImode, value)));
22424 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22425 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22426 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22427 if (MEM_SIZE_KNOWN_P (orig_dst))
22428 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22429 return dst;
22432 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22433 static enum stringop_alg
22434 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22435 int *dynamic_check, bool *noalign)
22437 const struct stringop_algs * algs;
22438 bool optimize_for_speed;
22439 /* Algorithms using the rep prefix want at least edi and ecx;
22440 additionally, memset wants eax and memcpy wants esi. Don't
22441 consider such algorithms if the user has appropriated those
22442 registers for their own purposes. */
22443 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22444 || (memset
22445 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22446 *noalign = false;
22448 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22449 || (alg != rep_prefix_1_byte \
22450 && alg != rep_prefix_4_byte \
22451 && alg != rep_prefix_8_byte))
22452 const struct processor_costs *cost;
22454 /* Even if the string operation call is cold, we still might spend a lot
22455 of time processing large blocks. */
22456 if (optimize_function_for_size_p (cfun)
22457 || (optimize_insn_for_size_p ()
22458 && expected_size != -1 && expected_size < 256))
22459 optimize_for_speed = false;
22460 else
22461 optimize_for_speed = true;
22463 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22465 *dynamic_check = -1;
22466 if (memset)
22467 algs = &cost->memset[TARGET_64BIT != 0];
22468 else
22469 algs = &cost->memcpy[TARGET_64BIT != 0];
22470 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22471 return ix86_stringop_alg;
22472 /* rep; movq or rep; movl is the smallest variant. */
22473 else if (!optimize_for_speed)
22475 if (!count || (count & 3))
22476 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22477 else
22478 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22480 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
22482 else if (expected_size != -1 && expected_size < 4)
22483 return loop_1_byte;
22484 else if (expected_size != -1)
22486 unsigned int i;
22487 enum stringop_alg alg = libcall;
22488 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22490 /* We get here if the algorithms that were not libcall-based
22491 were rep-prefix based and we are unable to use rep prefixes
22492 based on global register usage. Break out of the loop and
22493 use the heuristic below. */
22494 if (algs->size[i].max == 0)
22495 break;
22496 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22498 enum stringop_alg candidate = algs->size[i].alg;
22500 if (candidate != libcall && ALG_USABLE_P (candidate))
22501 alg = candidate;
22502 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22503 last non-libcall inline algorithm. */
22504 if (TARGET_INLINE_ALL_STRINGOPS)
22506 /* When the current size is best to be copied by a libcall,
22507 but we are still forced to inline, run the heuristic below
22508 that will pick code for medium sized blocks. */
22509 if (alg != libcall)
22510 return alg;
22511 break;
22513 else if (ALG_USABLE_P (candidate))
22515 *noalign = algs->size[i].noalign;
22516 return candidate;
22520 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22522 /* When asked to inline the call anyway, try to pick a meaningful choice.
22523 We look for the maximal size of block that is faster to copy by hand and
22524 take blocks of at most that size, guessing that the average size will
22525 be roughly half of the block.
22527 If this turns out to be bad, we might simply specify the preferred
22528 choice in ix86_costs. */
22529 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22530 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22532 int max = -1;
22533 enum stringop_alg alg;
22534 int i;
22535 bool any_alg_usable_p = true;
22537 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22539 enum stringop_alg candidate = algs->size[i].alg;
22540 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22542 if (candidate != libcall && candidate
22543 && ALG_USABLE_P (candidate))
22544 max = algs->size[i].max;
22546 /* If there aren't any usable algorithms, then recursing on
22547 smaller sizes isn't going to find anything. Just return the
22548 simple byte-at-a-time copy loop. */
22549 if (!any_alg_usable_p)
22551 /* Pick something reasonable. */
22552 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22553 *dynamic_check = 128;
22554 return loop_1_byte;
22556 if (max == -1)
22557 max = 4096;
22558 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22559 gcc_assert (*dynamic_check == -1);
22560 gcc_assert (alg != libcall);
22561 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22562 *dynamic_check = max;
22563 return alg;
22565 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22566 #undef ALG_USABLE_P
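/* Hedged sketch (not GCC code) of how the per-size table above is consulted
   by decide_alg: entries are scanned in order and the first whose MAX covers
   the expected size supplies the algorithm; a MAX of -1 means "no upper
   bound".  The struct and table layout here are purely illustrative
   assumptions.  */
struct alg_entry_sketch { long max; int alg; };

static int
pick_alg_sketch (const struct alg_entry_sketch *table, int n, long expected)
{
  int i;
  for (i = 0; i < n; i++)
    if (table[i].max == -1 || table[i].max >= expected)
      return table[i].alg;              /* first covering entry wins */
  return -1;                            /* fall back to a library call */
}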
22569 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22570 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22571 static int
22572 decide_alignment (int align,
22573 enum stringop_alg alg,
22574 int expected_size)
22576 int desired_align = 0;
22577 switch (alg)
22579 case no_stringop:
22580 gcc_unreachable ();
22581 case loop:
22582 case unrolled_loop:
22583 desired_align = GET_MODE_SIZE (Pmode);
22584 break;
22585 case rep_prefix_8_byte:
22586 desired_align = 8;
22587 break;
22588 case rep_prefix_4_byte:
22589 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22590 copying a whole cacheline at once. */
22591 if (TARGET_PENTIUMPRO)
22592 desired_align = 8;
22593 else
22594 desired_align = 4;
22595 break;
22596 case rep_prefix_1_byte:
22597 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22598 copying a whole cacheline at once. */
22599 if (TARGET_PENTIUMPRO)
22600 desired_align = 8;
22601 else
22602 desired_align = 1;
22603 break;
22604 case loop_1_byte:
22605 desired_align = 1;
22606 break;
22607 case libcall:
22608 return 0;
22611 if (optimize_size)
22612 desired_align = 1;
22613 if (desired_align < align)
22614 desired_align = align;
22615 if (expected_size != -1 && expected_size < 4)
22616 desired_align = align;
22617 return desired_align;
22620 /* Return the smallest power of 2 greater than VAL. */
22621 static int
22622 smallest_pow2_greater_than (int val)
22624 int ret = 1;
22625 while (ret <= val)
22626 ret <<= 1;
22627 return ret;
22630 /* Expand string move (memcpy) operation. Use i386 string operations
22631 when profitable. expand_setmem contains similar code. The code
22632 depends upon architecture, block size and alignment, but always has
22633 the same overall structure:
22635 1) Prologue guard: Conditional that jumps up to epilogues for small
22636 blocks that can be handled by epilogue alone. This is faster
22637 but also needed for correctness, since the prologue assumes the block
22638 is larger than the desired alignment.
22640 Optional dynamic check for size and libcall for large
22641 blocks is emitted here too, with -minline-stringops-dynamically.
22643 2) Prologue: copy first few bytes in order to get destination
22644 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22645 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22646 copied. We emit either a jump tree on power of two sized
22647 blocks, or a byte loop.
22649 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22650 with specified algorithm.
22652 4) Epilogue: code copying tail of the block that is too small to be
22653 handled by main body (or up to size guarded by prologue guard). */
22655 bool
22656 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22657 rtx expected_align_exp, rtx expected_size_exp)
22659 rtx destreg;
22660 rtx srcreg;
22661 rtx label = NULL;
22662 rtx tmp;
22663 rtx jump_around_label = NULL;
22664 HOST_WIDE_INT align = 1;
22665 unsigned HOST_WIDE_INT count = 0;
22666 HOST_WIDE_INT expected_size = -1;
22667 int size_needed = 0, epilogue_size_needed;
22668 int desired_align = 0, align_bytes = 0;
22669 enum stringop_alg alg;
22670 int dynamic_check;
22671 bool need_zero_guard = false;
22672 bool noalign;
22674 if (CONST_INT_P (align_exp))
22675 align = INTVAL (align_exp);
22676 /* i386 can do misaligned access at a reasonably increased cost. */
22677 if (CONST_INT_P (expected_align_exp)
22678 && INTVAL (expected_align_exp) > align)
22679 align = INTVAL (expected_align_exp);
22680 /* ALIGN is the minimum of destination and source alignment, but we care here
22681 just about destination alignment. */
22682 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22683 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22685 if (CONST_INT_P (count_exp))
22686 count = expected_size = INTVAL (count_exp);
22687 if (CONST_INT_P (expected_size_exp) && count == 0)
22688 expected_size = INTVAL (expected_size_exp);
22690 /* Make sure we don't need to care about overflow later on. */
22691 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22692 return false;
22694 /* Step 0: Decide on preferred algorithm, desired alignment and
22695 size of chunks to be copied by main loop. */
22697 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22698 desired_align = decide_alignment (align, alg, expected_size);
22700 if (!TARGET_ALIGN_STRINGOPS || noalign)
22701 align = desired_align;
22703 if (alg == libcall)
22704 return false;
22705 gcc_assert (alg != no_stringop);
22706 if (!count)
22707 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22708 destreg = copy_addr_to_reg (XEXP (dst, 0));
22709 srcreg = copy_addr_to_reg (XEXP (src, 0));
22710 switch (alg)
22712 case libcall:
22713 case no_stringop:
22714 gcc_unreachable ();
22715 case loop:
22716 need_zero_guard = true;
22717 size_needed = GET_MODE_SIZE (word_mode);
22718 break;
22719 case unrolled_loop:
22720 need_zero_guard = true;
22721 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22722 break;
22723 case rep_prefix_8_byte:
22724 size_needed = 8;
22725 break;
22726 case rep_prefix_4_byte:
22727 size_needed = 4;
22728 break;
22729 case rep_prefix_1_byte:
22730 size_needed = 1;
22731 break;
22732 case loop_1_byte:
22733 need_zero_guard = true;
22734 size_needed = 1;
22735 break;
22738 epilogue_size_needed = size_needed;
22740 /* Step 1: Prologue guard. */
22742 /* Alignment code needs count to be in register. */
22743 if (CONST_INT_P (count_exp) && desired_align > align)
22745 if (INTVAL (count_exp) > desired_align
22746 && INTVAL (count_exp) > size_needed)
22748 align_bytes
22749 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22750 if (align_bytes <= 0)
22751 align_bytes = 0;
22752 else
22753 align_bytes = desired_align - align_bytes;
22755 if (align_bytes == 0)
22756 count_exp = force_reg (counter_mode (count_exp), count_exp);
22758 gcc_assert (desired_align >= 1 && align >= 1);
22760 /* Ensure that alignment prologue won't copy past end of block. */
22761 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22763 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22764 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
22765 Make sure it is power of 2. */
22766 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22768 if (count)
22770 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22772 /* If main algorithm works on QImode, no epilogue is needed.
22773 For small sizes just don't align anything. */
22774 if (size_needed == 1)
22775 desired_align = align;
22776 else
22777 goto epilogue;
22780 else
22782 label = gen_label_rtx ();
22783 emit_cmp_and_jump_insns (count_exp,
22784 GEN_INT (epilogue_size_needed),
22785 LTU, 0, counter_mode (count_exp), 1, label);
22786 if (expected_size == -1 || expected_size < epilogue_size_needed)
22787 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22788 else
22789 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22793 /* Emit code to decide at runtime whether a library call or inline code
22794 should be used. */
22795 if (dynamic_check != -1)
22797 if (CONST_INT_P (count_exp))
22799 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22801 emit_block_move_via_libcall (dst, src, count_exp, false);
22802 count_exp = const0_rtx;
22803 goto epilogue;
22806 else
22808 rtx hot_label = gen_label_rtx ();
22809 jump_around_label = gen_label_rtx ();
22810 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22811 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22812 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22813 emit_block_move_via_libcall (dst, src, count_exp, false);
22814 emit_jump (jump_around_label);
22815 emit_label (hot_label);
22819 /* Step 2: Alignment prologue. */
22821 if (desired_align > align)
22823 if (align_bytes == 0)
22825 /* Except for the first move in the epilogue, we no longer know
22826 the constant offset in the aliasing info.  It doesn't seem worth
22827 the pain to maintain it for the first move, so throw away
22828 the info early. */
22829 src = change_address (src, BLKmode, srcreg);
22830 dst = change_address (dst, BLKmode, destreg);
22831 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22832 desired_align);
22834 else
22836 /* If we know how many bytes need to be stored before dst is
22837 sufficiently aligned, maintain aliasing info accurately. */
22838 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22839 desired_align, align_bytes);
22840 count_exp = plus_constant (counter_mode (count_exp),
22841 count_exp, -align_bytes);
22842 count -= align_bytes;
22844 if (need_zero_guard
22845 && (count < (unsigned HOST_WIDE_INT) size_needed
22846 || (align_bytes == 0
22847 && count < ((unsigned HOST_WIDE_INT) size_needed
22848 + desired_align - align))))
22850 /* It is possible that we copied enough so the main loop will not
22851 execute. */
22852 gcc_assert (size_needed > 1);
22853 if (label == NULL_RTX)
22854 label = gen_label_rtx ();
22855 emit_cmp_and_jump_insns (count_exp,
22856 GEN_INT (size_needed),
22857 LTU, 0, counter_mode (count_exp), 1, label);
22858 if (expected_size == -1
22859 || expected_size < (desired_align - align) / 2 + size_needed)
22860 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22861 else
22862 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22865 if (label && size_needed == 1)
22867 emit_label (label);
22868 LABEL_NUSES (label) = 1;
22869 label = NULL;
22870 epilogue_size_needed = 1;
22872 else if (label == NULL_RTX)
22873 epilogue_size_needed = size_needed;
22875 /* Step 3: Main loop. */
22877 switch (alg)
22879 case libcall:
22880 case no_stringop:
22881 gcc_unreachable ();
22882 case loop_1_byte:
22883 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22884 count_exp, QImode, 1, expected_size);
22885 break;
22886 case loop:
22887 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22888 count_exp, word_mode, 1, expected_size);
22889 break;
22890 case unrolled_loop:
22891 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22892 registers for 4 temporaries anyway. */
22893 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22894 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22895 expected_size);
22896 break;
22897 case rep_prefix_8_byte:
22898 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22899 DImode);
22900 break;
22901 case rep_prefix_4_byte:
22902 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22903 SImode);
22904 break;
22905 case rep_prefix_1_byte:
22906 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22907 QImode);
22908 break;
22910 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22911 if (CONST_INT_P (count_exp))
22913 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22914 (count / size_needed) * size_needed);
22915 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22916 (count / size_needed) * size_needed);
22918 else
22920 src = change_address (src, BLKmode, srcreg);
22921 dst = change_address (dst, BLKmode, destreg);
22924 /* Step 4: Epilogue to copy the remaining bytes. */
22925 epilogue:
22926 if (label)
22928 /* When the main loop is done, COUNT_EXP might hold original count,
22929 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
22930 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
22931 bytes. Compensate if needed. */
22933 if (size_needed < epilogue_size_needed)
22935 tmp =
22936 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22937 GEN_INT (size_needed - 1), count_exp, 1,
22938 OPTAB_DIRECT);
22939 if (tmp != count_exp)
22940 emit_move_insn (count_exp, tmp);
22942 emit_label (label);
22943 LABEL_NUSES (label) = 1;
22946 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22947 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22948 epilogue_size_needed);
22949 if (jump_around_label)
22950 emit_label (jump_around_label);
22951 return true;
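/* Worked example of the epilogue handling above (illustrative only): with
   size_needed == 16 and a runtime count of 100, the main loop copies
   100 / 16 * 16 = 96 bytes, COUNT_EXP is masked down to 100 & 15 = 4, and
   the epilogue copies the remaining 4 bytes.  */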
22954 /* Helper function for memset. For the QImode value 0xXY produce
22955 0xXYXYXYXY of the width specified by MODE. This is essentially
22956 a * 0x10101010, but we can do slightly better than
22957 synth_mult by unwinding the sequence by hand on CPUs with
22958 a slow multiply. */
22959 static rtx
22960 promote_duplicated_reg (enum machine_mode mode, rtx val)
22962 enum machine_mode valmode = GET_MODE (val);
22963 rtx tmp;
22964 int nops = mode == DImode ? 3 : 2;
22966 gcc_assert (mode == SImode || mode == DImode);
22967 if (val == const0_rtx)
22968 return copy_to_mode_reg (mode, const0_rtx);
22969 if (CONST_INT_P (val))
22971 HOST_WIDE_INT v = INTVAL (val) & 255;
22973 v |= v << 8;
22974 v |= v << 16;
22975 if (mode == DImode)
22976 v |= (v << 16) << 16;
22977 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22980 if (valmode == VOIDmode)
22981 valmode = QImode;
22982 if (valmode != QImode)
22983 val = gen_lowpart (QImode, val);
22984 if (mode == QImode)
22985 return val;
22986 if (!TARGET_PARTIAL_REG_STALL)
22987 nops--;
22988 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22989 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22990 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22991 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22993 rtx reg = convert_modes (mode, QImode, val, true);
22994 tmp = promote_duplicated_reg (mode, const1_rtx);
22995 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22996 OPTAB_DIRECT);
22998 else
23000 rtx reg = convert_modes (mode, QImode, val, true);
23002 if (!TARGET_PARTIAL_REG_STALL)
23003 if (mode == SImode)
23004 emit_insn (gen_movsi_insv_1 (reg, reg));
23005 else
23006 emit_insn (gen_movdi_insv_1 (reg, reg));
23007 else
23009 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23010 NULL, 1, OPTAB_DIRECT);
23011 reg =
23012 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23014 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23015 NULL, 1, OPTAB_DIRECT);
23016 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23017 if (mode == SImode)
23018 return reg;
23019 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23020 NULL, 1, OPTAB_DIRECT);
23021 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23022 return reg;
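/* Illustrative model of the replication above (not part of the compiler):
   for a SImode/DImode promotion of the byte 0x4A, both the constant path
   and the shift/IOR path compute

     v  = 0x4A;
     v |= v << 8;             // 0x4A4A
     v |= v << 16;            // 0x4A4A4A4A
     v |= (v << 16) << 16;    // 0x4A4A4A4A4A4A4A4A (DImode only)

   i.e. the value 0x4A * 0x0101010101010101.  */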
23026 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
23027 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
23028 alignment from ALIGN to DESIRED_ALIGN. */
23029 static rtx
23030 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23032 rtx promoted_val;
23034 if (TARGET_64BIT
23035 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23036 promoted_val = promote_duplicated_reg (DImode, val);
23037 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23038 promoted_val = promote_duplicated_reg (SImode, val);
23039 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23040 promoted_val = promote_duplicated_reg (HImode, val);
23041 else
23042 promoted_val = val;
23044 return promoted_val;
23047 /* Expand string clear operation (bzero). Use i386 string operations when
23048 profitable. See the expand_movmem comment for an explanation of the
23049 individual steps performed. */
23050 bool
23051 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23052 rtx expected_align_exp, rtx expected_size_exp)
23054 rtx destreg;
23055 rtx label = NULL;
23056 rtx tmp;
23057 rtx jump_around_label = NULL;
23058 HOST_WIDE_INT align = 1;
23059 unsigned HOST_WIDE_INT count = 0;
23060 HOST_WIDE_INT expected_size = -1;
23061 int size_needed = 0, epilogue_size_needed;
23062 int desired_align = 0, align_bytes = 0;
23063 enum stringop_alg alg;
23064 rtx promoted_val = NULL;
23065 bool force_loopy_epilogue = false;
23066 int dynamic_check;
23067 bool need_zero_guard = false;
23068 bool noalign;
23070 if (CONST_INT_P (align_exp))
23071 align = INTVAL (align_exp);
23072 /* i386 can do misaligned access at a reasonably increased cost. */
23073 if (CONST_INT_P (expected_align_exp)
23074 && INTVAL (expected_align_exp) > align)
23075 align = INTVAL (expected_align_exp);
23076 if (CONST_INT_P (count_exp))
23077 count = expected_size = INTVAL (count_exp);
23078 if (CONST_INT_P (expected_size_exp) && count == 0)
23079 expected_size = INTVAL (expected_size_exp);
23081 /* Make sure we don't need to care about overflow later on. */
23082 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23083 return false;
23085 /* Step 0: Decide on preferred algorithm, desired alignment and
23086 size of chunks to be copied by main loop. */
23088 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23089 desired_align = decide_alignment (align, alg, expected_size);
23091 if (!TARGET_ALIGN_STRINGOPS || noalign)
23092 align = desired_align;
23094 if (alg == libcall)
23095 return false;
23096 gcc_assert (alg != no_stringop);
23097 if (!count)
23098 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23099 destreg = copy_addr_to_reg (XEXP (dst, 0));
23100 switch (alg)
23102 case libcall:
23103 case no_stringop:
23104 gcc_unreachable ();
23105 case loop:
23106 need_zero_guard = true;
23107 size_needed = GET_MODE_SIZE (word_mode);
23108 break;
23109 case unrolled_loop:
23110 need_zero_guard = true;
23111 size_needed = GET_MODE_SIZE (word_mode) * 4;
23112 break;
23113 case rep_prefix_8_byte:
23114 size_needed = 8;
23115 break;
23116 case rep_prefix_4_byte:
23117 size_needed = 4;
23118 break;
23119 case rep_prefix_1_byte:
23120 size_needed = 1;
23121 break;
23122 case loop_1_byte:
23123 need_zero_guard = true;
23124 size_needed = 1;
23125 break;
23127 epilogue_size_needed = size_needed;
23129 /* Step 1: Prologue guard. */
23131 /* Alignment code needs count to be in register. */
23132 if (CONST_INT_P (count_exp) && desired_align > align)
23134 if (INTVAL (count_exp) > desired_align
23135 && INTVAL (count_exp) > size_needed)
23137 align_bytes
23138 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23139 if (align_bytes <= 0)
23140 align_bytes = 0;
23141 else
23142 align_bytes = desired_align - align_bytes;
23144 if (align_bytes == 0)
23146 enum machine_mode mode = SImode;
23147 if (TARGET_64BIT && (count & ~0xffffffff))
23148 mode = DImode;
23149 count_exp = force_reg (mode, count_exp);
23152 /* Do the cheap promotion to allow better CSE across the
23153 main loop and epilogue (i.e., one load of the big constant in
23154 front of all the code). */
23155 if (CONST_INT_P (val_exp))
23156 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23157 desired_align, align);
23158 /* Ensure that alignment prologue won't copy past end of block. */
23159 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23161 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23162 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23163 Make sure it is power of 2. */
23164 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23166 /* To improve performance of small blocks, we jump around the code
23167 promoting VAL. This means that if the promoted VAL is not constant,
23168 we might not use it in the epilogue and have to fall back to the
23169 byte loop variant. */
23170 if (epilogue_size_needed > 2 && !promoted_val)
23171 force_loopy_epilogue = true;
23172 if (count)
23174 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23176 /* If main algorithm works on QImode, no epilogue is needed.
23177 For small sizes just don't align anything. */
23178 if (size_needed == 1)
23179 desired_align = align;
23180 else
23181 goto epilogue;
23184 else
23186 label = gen_label_rtx ();
23187 emit_cmp_and_jump_insns (count_exp,
23188 GEN_INT (epilogue_size_needed),
23189 LTU, 0, counter_mode (count_exp), 1, label);
23190 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23191 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23192 else
23193 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23196 if (dynamic_check != -1)
23198 rtx hot_label = gen_label_rtx ();
23199 jump_around_label = gen_label_rtx ();
23200 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23201 LEU, 0, counter_mode (count_exp), 1, hot_label);
23202 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23203 set_storage_via_libcall (dst, count_exp, val_exp, false);
23204 emit_jump (jump_around_label);
23205 emit_label (hot_label);
23208 /* Step 2: Alignment prologue. */
23210 /* Do the expensive promotion once we branched off the small blocks. */
23211 if (!promoted_val)
23212 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23213 desired_align, align);
23214 gcc_assert (desired_align >= 1 && align >= 1);
23216 if (desired_align > align)
23218 if (align_bytes == 0)
23220 /* Except for the first move in the epilogue, we no longer know
23221 the constant offset in the aliasing info. It does not seem worth
23222 the pain to maintain it for the first move, so throw away
23223 the info early. */
23224 dst = change_address (dst, BLKmode, destreg);
23225 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23226 desired_align);
23228 else
23230 /* If we know how many bytes need to be stored before dst is
23231 sufficiently aligned, maintain aliasing info accurately. */
23232 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23233 desired_align, align_bytes);
23234 count_exp = plus_constant (counter_mode (count_exp),
23235 count_exp, -align_bytes);
23236 count -= align_bytes;
23238 if (need_zero_guard
23239 && (count < (unsigned HOST_WIDE_INT) size_needed
23240 || (align_bytes == 0
23241 && count < ((unsigned HOST_WIDE_INT) size_needed
23242 + desired_align - align))))
23244 /* It is possible that we copied enough so the main loop will not
23245 execute. */
23246 gcc_assert (size_needed > 1);
23247 if (label == NULL_RTX)
23248 label = gen_label_rtx ();
23249 emit_cmp_and_jump_insns (count_exp,
23250 GEN_INT (size_needed),
23251 LTU, 0, counter_mode (count_exp), 1, label);
23252 if (expected_size == -1
23253 || expected_size < (desired_align - align) / 2 + size_needed)
23254 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23255 else
23256 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23259 if (label && size_needed == 1)
23261 emit_label (label);
23262 LABEL_NUSES (label) = 1;
23263 label = NULL;
23264 promoted_val = val_exp;
23265 epilogue_size_needed = 1;
23267 else if (label == NULL_RTX)
23268 epilogue_size_needed = size_needed;
23270 /* Step 3: Main loop. */
23272 switch (alg)
23274 case libcall:
23275 case no_stringop:
23276 gcc_unreachable ();
23277 case loop_1_byte:
23278 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23279 count_exp, QImode, 1, expected_size);
23280 break;
23281 case loop:
23282 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23283 count_exp, word_mode, 1, expected_size);
23284 break;
23285 case unrolled_loop:
23286 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23287 count_exp, word_mode, 4, expected_size);
23288 break;
23289 case rep_prefix_8_byte:
23290 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23291 DImode, val_exp);
23292 break;
23293 case rep_prefix_4_byte:
23294 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23295 SImode, val_exp);
23296 break;
23297 case rep_prefix_1_byte:
23298 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23299 QImode, val_exp);
23300 break;
23302 /* Properly adjust the offset of the dest memory for aliasing. */
23303 if (CONST_INT_P (count_exp))
23304 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23305 (count / size_needed) * size_needed);
23306 else
23307 dst = change_address (dst, BLKmode, destreg);
23309 /* Step 4: Epilogue to copy the remaining bytes. */
23311 if (label)
23313 /* When the main loop is done, COUNT_EXP might hold original count,
23314 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
23315 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
23316 bytes. Compensate if needed. */
23318 if (size_needed < epilogue_size_needed)
23320 tmp =
23321 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23322 GEN_INT (size_needed - 1), count_exp, 1,
23323 OPTAB_DIRECT);
23324 if (tmp != count_exp)
23325 emit_move_insn (count_exp, tmp);
23327 emit_label (label);
23328 LABEL_NUSES (label) = 1;
23330 epilogue:
23331 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23333 if (force_loopy_epilogue)
23334 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23335 epilogue_size_needed);
23336 else
23337 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23338 epilogue_size_needed);
23340 if (jump_around_label)
23341 emit_label (jump_around_label);
23342 return true;
23345 /* Expand the appropriate insns for doing strlen if not just doing
23346 repnz; scasb
23348 out = result, initialized with the start address
23349 align_rtx = alignment of the address.
23350 scratch = scratch register, initialized with the start address when
23351 not aligned, otherwise undefined
23353 This is just the body. It needs the initializations mentioned above and
23354 some address computation at the end. These things are done in i386.md. */
23356 static void
23357 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23359 int align;
23360 rtx tmp;
23361 rtx align_2_label = NULL_RTX;
23362 rtx align_3_label = NULL_RTX;
23363 rtx align_4_label = gen_label_rtx ();
23364 rtx end_0_label = gen_label_rtx ();
23365 rtx mem;
23366 rtx tmpreg = gen_reg_rtx (SImode);
23367 rtx scratch = gen_reg_rtx (SImode);
23368 rtx cmp;
23370 align = 0;
23371 if (CONST_INT_P (align_rtx))
23372 align = INTVAL (align_rtx);
23374 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23376 /* Is there a known alignment and is it less than 4? */
23377 if (align < 4)
23379 rtx scratch1 = gen_reg_rtx (Pmode);
23380 emit_move_insn (scratch1, out);
23381 /* Is there a known alignment and is it not 2? */
23382 if (align != 2)
23384 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23385 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23387 /* Leave just the 3 lower bits. */
23388 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23389 NULL_RTX, 0, OPTAB_WIDEN);
23391 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23392 Pmode, 1, align_4_label);
23393 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23394 Pmode, 1, align_2_label);
23395 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23396 Pmode, 1, align_3_label);
23398 else
23400 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23401 check whether it is aligned to a 4-byte boundary. */
23403 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23404 NULL_RTX, 0, OPTAB_WIDEN);
23406 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23407 Pmode, 1, align_4_label);
23410 mem = change_address (src, QImode, out);
23412 /* Now compare the bytes. */
23414 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23415 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23416 QImode, 1, end_0_label);
23418 /* Increment the address. */
23419 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23421 /* Not needed with an alignment of 2 */
23422 if (align != 2)
23424 emit_label (align_2_label);
23426 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23427 end_0_label);
23429 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23431 emit_label (align_3_label);
23434 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23435 end_0_label);
23437 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23440 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23441 align this loop: it only makes programs larger and does not help
23442 them run faster. */
23443 emit_label (align_4_label);
23445 mem = change_address (src, SImode, out);
23446 emit_move_insn (scratch, mem);
23447 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23449 /* This formula yields a nonzero result iff one of the bytes is zero.
23450 This saves three branches inside the loop and many cycles. */
23452 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23453 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23454 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23455 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23456 gen_int_mode (0x80808080, SImode)));
23457 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23458 align_4_label);
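/* Worked example (illustrative only) of the zero-byte test above, which
   computes (x - 0x01010101) & ~x & 0x80808080:

     x = 0x11002233 (contains a zero byte):
       x - 0x01010101 = 0x0FFF2132
       & ~x           = 0x0FFF2132 & 0xEEFFDDCC = 0x0EFF0100
       & 0x80808080   = 0x00800000  -> nonzero, a zero byte was found

     x = 0x61626364 (no zero byte):
       x - 0x01010101 = 0x60616263
       & ~x           = 0x60616263 & 0x9E9D9C9B = 0x00010003
       & 0x80808080   = 0x00000000  -> zero, loop again  */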
23460 if (TARGET_CMOVE)
23462 rtx reg = gen_reg_rtx (SImode);
23463 rtx reg2 = gen_reg_rtx (Pmode);
23464 emit_move_insn (reg, tmpreg);
23465 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23467 /* If zero is not in the first two bytes, move two bytes forward. */
23468 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23469 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23470 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23471 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23472 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23473 reg,
23474 tmpreg)));
23475 /* Emit lea manually to avoid clobbering of flags. */
23476 emit_insn (gen_rtx_SET (SImode, reg2,
23477 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23479 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23480 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23481 emit_insn (gen_rtx_SET (VOIDmode, out,
23482 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23483 reg2,
23484 out)));
23486 else
23488 rtx end_2_label = gen_label_rtx ();
23489 /* Is zero in the first two bytes? */
23491 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23492 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23493 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23494 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23495 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23496 pc_rtx);
23497 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23498 JUMP_LABEL (tmp) = end_2_label;
23500 /* Not in the first two. Move two bytes forward. */
23501 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23502 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23504 emit_label (end_2_label);
23508 /* Avoid branch in fixing the byte. */
23509 tmpreg = gen_lowpart (QImode, tmpreg);
23510 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23511 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23512 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23513 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23515 emit_label (end_0_label);
23518 /* Expand strlen. */
23520 bool
23521 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23523 rtx addr, scratch1, scratch2, scratch3, scratch4;
23525 /* The generic case of the strlen expander is long. Avoid
23526 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
23528 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23529 && !TARGET_INLINE_ALL_STRINGOPS
23530 && !optimize_insn_for_size_p ()
23531 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23532 return false;
23534 addr = force_reg (Pmode, XEXP (src, 0));
23535 scratch1 = gen_reg_rtx (Pmode);
23537 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23538 && !optimize_insn_for_size_p ())
23540 /* Well, it seems that some optimizer does not combine a call like
23541 foo(strlen(bar), strlen(bar));
23542 when the move and the subtraction are done here. It does calculate
23543 the length just once when these instructions are done inside
23544 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
23545 often used and I use one fewer register for the lifetime of
23546 output_strlen_unroll(), this is better. */
23548 emit_move_insn (out, addr);
23550 ix86_expand_strlensi_unroll_1 (out, src, align);
23552 /* strlensi_unroll_1 returns the address of the zero at the end of
23553 the string, like memchr(), so compute the length by subtracting
23554 the start address. */
23555 emit_insn (ix86_gen_sub3 (out, out, addr));
23557 else
23559 rtx unspec;
23561 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23562 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23563 return false;
23565 scratch2 = gen_reg_rtx (Pmode);
23566 scratch3 = gen_reg_rtx (Pmode);
23567 scratch4 = force_reg (Pmode, constm1_rtx);
23569 emit_move_insn (scratch3, addr);
23570 eoschar = force_reg (QImode, eoschar);
23572 src = replace_equiv_address_nv (src, scratch3);
23574 /* If .md starts supporting :P, this can be done in .md. */
23575 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23576 scratch4), UNSPEC_SCAS);
23577 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23578 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23579 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
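/* Worked example (illustrative only): scratch4 starts at -1, and
   "repnz scasb" decrements the counter once per byte scanned, including the
   terminating NUL.  For the string "abc" it scans 4 bytes, leaving the
   counter (scratch1) at -1 - 4 = -5; the one's complement gives ~(-5) = 4
   and adding -1 yields 3, which is strlen ("abc").  */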
23581 return true;
23584 /* For a given symbol (function), construct code to compute the address of
23585 its PLT entry in the large x86-64 PIC model. */
23586 static rtx
23587 construct_plt_address (rtx symbol)
23589 rtx tmp, unspec;
23591 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23592 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23593 gcc_assert (Pmode == DImode);
23595 tmp = gen_reg_rtx (Pmode);
23596 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23598 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23599 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23600 return tmp;
23604 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23605 rtx callarg2,
23606 rtx pop, bool sibcall)
23608 /* We need to represent that SI and DI registers are clobbered
23609 by SYSV calls. */
23610 static int clobbered_registers[] = {
23611 XMM6_REG, XMM7_REG, XMM8_REG,
23612 XMM9_REG, XMM10_REG, XMM11_REG,
23613 XMM12_REG, XMM13_REG, XMM14_REG,
23614 XMM15_REG, SI_REG, DI_REG
23616 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23617 rtx use = NULL, call;
23618 unsigned int vec_len;
23620 if (pop == const0_rtx)
23621 pop = NULL;
23622 gcc_assert (!TARGET_64BIT || !pop);
23624 if (TARGET_MACHO && !TARGET_64BIT)
23626 #if TARGET_MACHO
23627 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23628 fnaddr = machopic_indirect_call_target (fnaddr);
23629 #endif
23631 else
23633 /* Static functions and indirect calls don't need the pic register. */
23634 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23635 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23636 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23637 use_reg (&use, pic_offset_table_rtx);
23640 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23642 rtx al = gen_rtx_REG (QImode, AX_REG);
23643 emit_move_insn (al, callarg2);
23644 use_reg (&use, al);
23647 if (ix86_cmodel == CM_LARGE_PIC
23648 && MEM_P (fnaddr)
23649 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23650 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23651 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23652 else if (sibcall
23653 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23654 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23656 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23657 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23660 vec_len = 0;
23661 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23662 if (retval)
23663 call = gen_rtx_SET (VOIDmode, retval, call);
23664 vec[vec_len++] = call;
23666 if (pop)
23668 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23669 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23670 vec[vec_len++] = pop;
23673 if (TARGET_64BIT_MS_ABI
23674 && (!callarg2 || INTVAL (callarg2) != -2))
23676 unsigned i;
23678 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23679 UNSPEC_MS_TO_SYSV_CALL);
23681 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23682 vec[vec_len++]
23683 = gen_rtx_CLOBBER (VOIDmode,
23684 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23685 ? TImode : DImode,
23686 clobbered_registers[i]));
23689 if (vec_len > 1)
23690 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23691 call = emit_call_insn (call);
23692 if (use)
23693 CALL_INSN_FUNCTION_USAGE (call) = use;
23695 return call;
23698 /* Output the assembly for a call instruction. */
23700 const char *
23701 ix86_output_call_insn (rtx insn, rtx call_op)
23703 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23704 bool seh_nop_p = false;
23705 const char *xasm;
23707 if (SIBLING_CALL_P (insn))
23709 if (direct_p)
23710 xasm = "jmp\t%P0";
23711 /* SEH epilogue detection requires the indirect branch case
23712 to include REX.W. */
23713 else if (TARGET_SEH)
23714 xasm = "rex.W jmp %A0";
23715 else
23716 xasm = "jmp\t%A0";
23718 output_asm_insn (xasm, &call_op);
23719 return "";
23722 /* SEH unwinding can require an extra nop to be emitted in several
23723 circumstances. Determine if we have one of those. */
23724 if (TARGET_SEH)
23726 rtx i;
23728 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23730 /* If we get to another real insn, we don't need the nop. */
23731 if (INSN_P (i))
23732 break;
23734 /* If we get to the epilogue note, prevent a catch region from
23735 being adjacent to the standard epilogue sequence. If non-
23736 call-exceptions, we'll have done this during epilogue emission. */
23737 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23738 && !flag_non_call_exceptions
23739 && !can_throw_internal (insn))
23741 seh_nop_p = true;
23742 break;
23746 /* If we didn't find a real insn following the call, prevent the
23747 unwinder from looking into the next function. */
23748 if (i == NULL)
23749 seh_nop_p = true;
23752 if (direct_p)
23753 xasm = "call\t%P0";
23754 else
23755 xasm = "call\t%A0";
23757 output_asm_insn (xasm, &call_op);
23759 if (seh_nop_p)
23760 return "nop";
23762 return "";
23765 /* Clear stack slot assignments remembered from previous functions.
23766 This is called from INIT_EXPANDERS once before RTL is emitted for each
23767 function. */
23769 static struct machine_function *
23770 ix86_init_machine_status (void)
23772 struct machine_function *f;
23774 f = ggc_alloc_cleared_machine_function ();
23775 f->use_fast_prologue_epilogue_nregs = -1;
23776 f->call_abi = ix86_abi;
23778 return f;
23781 /* Return a MEM corresponding to a stack slot with mode MODE.
23782 Allocate a new slot if necessary.
23784 The RTL for a function can have several slots available: N is
23785 which slot to use. */
23788 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23790 struct stack_local_entry *s;
23792 gcc_assert (n < MAX_386_STACK_LOCALS);
23794 for (s = ix86_stack_locals; s; s = s->next)
23795 if (s->mode == mode && s->n == n)
23796 return validize_mem (copy_rtx (s->rtl));
23798 s = ggc_alloc_stack_local_entry ();
23799 s->n = n;
23800 s->mode = mode;
23801 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23803 s->next = ix86_stack_locals;
23804 ix86_stack_locals = s;
23805 return validize_mem (s->rtl);
23808 static void
23809 ix86_instantiate_decls (void)
23811 struct stack_local_entry *s;
23813 for (s = ix86_stack_locals; s; s = s->next)
23814 if (s->rtl != NULL_RTX)
23815 instantiate_decl_rtl (s->rtl);
23818 /* Calculate the length of the memory address in the instruction encoding.
23819 Includes the addr32 prefix, but does not include the one-byte modrm,
23820 opcode, or other prefixes. We never generate an addr32 prefix for the LEA insn. */
23823 memory_address_length (rtx addr, bool lea)
23825 struct ix86_address parts;
23826 rtx base, index, disp;
23827 int len;
23828 int ok;
23830 if (GET_CODE (addr) == PRE_DEC
23831 || GET_CODE (addr) == POST_INC
23832 || GET_CODE (addr) == PRE_MODIFY
23833 || GET_CODE (addr) == POST_MODIFY)
23834 return 0;
23836 ok = ix86_decompose_address (addr, &parts);
23837 gcc_assert (ok);
23839 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23841 /* If this is not LEA instruction, add the length of addr32 prefix. */
23842 if (TARGET_64BIT && !lea
23843 && (SImode_address_operand (addr, VOIDmode)
23844 || (parts.base && GET_MODE (parts.base) == SImode)
23845 || (parts.index && GET_MODE (parts.index) == SImode)))
23846 len++;
23848 base = parts.base;
23849 index = parts.index;
23850 disp = parts.disp;
23852 if (base && GET_CODE (base) == SUBREG)
23853 base = SUBREG_REG (base);
23854 if (index && GET_CODE (index) == SUBREG)
23855 index = SUBREG_REG (index);
23857 gcc_assert (base == NULL_RTX || REG_P (base));
23858 gcc_assert (index == NULL_RTX || REG_P (index));
23860 /* Rule of thumb:
23861 - esp as the base always wants an index,
23862 - ebp as the base always wants a displacement,
23863 - r12 as the base always wants an index,
23864 - r13 as the base always wants a displacement. */
23866 /* Register Indirect. */
23867 if (base && !index && !disp)
23869 /* esp (for its index) and ebp (for its displacement) need
23870 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23871 code. */
23872 if (base == arg_pointer_rtx
23873 || base == frame_pointer_rtx
23874 || REGNO (base) == SP_REG
23875 || REGNO (base) == BP_REG
23876 || REGNO (base) == R12_REG
23877 || REGNO (base) == R13_REG)
23878 len++;
23881 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23882 is not disp32, but disp32(%rip), so for disp32
23883 SIB byte is needed, unless print_operand_address
23884 optimizes it into disp32(%rip) or (%rip) is implied
23885 by UNSPEC. */
23886 else if (disp && !base && !index)
23888 len += 4;
23889 if (TARGET_64BIT)
23891 rtx symbol = disp;
23893 if (GET_CODE (disp) == CONST)
23894 symbol = XEXP (disp, 0);
23895 if (GET_CODE (symbol) == PLUS
23896 && CONST_INT_P (XEXP (symbol, 1)))
23897 symbol = XEXP (symbol, 0);
23899 if (GET_CODE (symbol) != LABEL_REF
23900 && (GET_CODE (symbol) != SYMBOL_REF
23901 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23902 && (GET_CODE (symbol) != UNSPEC
23903 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23904 && XINT (symbol, 1) != UNSPEC_PCREL
23905 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23906 len++;
23909 else
23911 /* Find the length of the displacement constant. */
23912 if (disp)
23914 if (base && satisfies_constraint_K (disp))
23915 len += 1;
23916 else
23917 len += 4;
23919 /* ebp always wants a displacement. Similarly r13. */
23920 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23921 len++;
23923 /* An index requires the two-byte modrm form.... */
23924 if (index
23925 /* ...like esp (or r12), which always wants an index. */
23926 || base == arg_pointer_rtx
23927 || base == frame_pointer_rtx
23928 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23929 len++;
23932 return len;
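/* A few illustrative results of the function above (assuming 64-bit code,
   no segment override and no addr32 prefix):
     (%rax)            -> 0   plain register indirect
     (%rsp), (%r12)    -> 1   SIB byte required
     16(%rbp), 8(%r13) -> 1   disp8 required
     1024(%rbx,%rcx,4) -> 5   disp32 (4) plus the SIB byte for the index (1)
   The one-byte modrm and the opcode are not counted here.  */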
23935 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23936 is set, expect that insn have 8bit immediate alternative. */
23938 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23940 int len = 0;
23941 int i;
23942 extract_insn_cached (insn);
23943 for (i = recog_data.n_operands - 1; i >= 0; --i)
23944 if (CONSTANT_P (recog_data.operand[i]))
23946 enum attr_mode mode = get_attr_mode (insn);
23948 gcc_assert (!len);
23949 if (shortform && CONST_INT_P (recog_data.operand[i]))
23951 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23952 switch (mode)
23954 case MODE_QI:
23955 len = 1;
23956 continue;
23957 case MODE_HI:
23958 ival = trunc_int_for_mode (ival, HImode);
23959 break;
23960 case MODE_SI:
23961 ival = trunc_int_for_mode (ival, SImode);
23962 break;
23963 default:
23964 break;
23966 if (IN_RANGE (ival, -128, 127))
23968 len = 1;
23969 continue;
23972 switch (mode)
23974 case MODE_QI:
23975 len = 1;
23976 break;
23977 case MODE_HI:
23978 len = 2;
23979 break;
23980 case MODE_SI:
23981 len = 4;
23982 break;
23983 /* Immediates for DImode instructions are encoded
23984 as 32-bit sign-extended values. */
23985 case MODE_DI:
23986 len = 4;
23987 break;
23988 default:
23989 fatal_insn ("unknown insn mode", insn);
23992 return len;
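/* Illustrative examples for the function above: with SHORTFORM set,
   "addl $100, %eax" has a sign-extendable imm8 and gets length 1, while
   "addl $300, %eax" needs a full imm32 and gets length 4; DImode immediates
   are likewise encoded as 32-bit sign-extended values, so length 4.  */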
23995 /* Compute default value for "length_address" attribute. */
23997 ix86_attr_length_address_default (rtx insn)
23999 int i;
24001 if (get_attr_type (insn) == TYPE_LEA)
24003 rtx set = PATTERN (insn), addr;
24005 if (GET_CODE (set) == PARALLEL)
24006 set = XVECEXP (set, 0, 0);
24008 gcc_assert (GET_CODE (set) == SET);
24010 addr = SET_SRC (set);
24012 return memory_address_length (addr, true);
24015 extract_insn_cached (insn);
24016 for (i = recog_data.n_operands - 1; i >= 0; --i)
24017 if (MEM_P (recog_data.operand[i]))
24019 constrain_operands_cached (reload_completed);
24020 if (which_alternative != -1)
24022 const char *constraints = recog_data.constraints[i];
24023 int alt = which_alternative;
24025 while (*constraints == '=' || *constraints == '+')
24026 constraints++;
24027 while (alt-- > 0)
24028 while (*constraints++ != ',')
24030 /* Skip ignored operands. */
24031 if (*constraints == 'X')
24032 continue;
24034 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24036 return 0;
24039 /* Compute default value for "length_vex" attribute. It includes
24040 2 or 3 byte VEX prefix and 1 opcode byte. */
24043 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24045 int i;
24047 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
24048 requires the 3-byte VEX prefix. */
24049 if (!has_0f_opcode || has_vex_w)
24050 return 3 + 1;
24052 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24053 if (!TARGET_64BIT)
24054 return 2 + 1;
24056 extract_insn_cached (insn);
24058 for (i = recog_data.n_operands - 1; i >= 0; --i)
24059 if (REG_P (recog_data.operand[i]))
24061 /* REX.W bit uses 3 byte VEX prefix. */
24062 if (GET_MODE (recog_data.operand[i]) == DImode
24063 && GENERAL_REG_P (recog_data.operand[i]))
24064 return 3 + 1;
24066 else
24068 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24069 if (MEM_P (recog_data.operand[i])
24070 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24071 return 3 + 1;
24074 return 2 + 1;
24077 /* Return the maximum number of instructions a cpu can issue. */
24079 static int
24080 ix86_issue_rate (void)
24082 switch (ix86_tune)
24084 case PROCESSOR_PENTIUM:
24085 case PROCESSOR_ATOM:
24086 case PROCESSOR_K6:
24087 case PROCESSOR_BTVER2:
24088 return 2;
24090 case PROCESSOR_PENTIUMPRO:
24091 case PROCESSOR_PENTIUM4:
24092 case PROCESSOR_CORE2:
24093 case PROCESSOR_COREI7:
24094 case PROCESSOR_HASWELL:
24095 case PROCESSOR_ATHLON:
24096 case PROCESSOR_K8:
24097 case PROCESSOR_AMDFAM10:
24098 case PROCESSOR_NOCONA:
24099 case PROCESSOR_GENERIC32:
24100 case PROCESSOR_GENERIC64:
24101 case PROCESSOR_BDVER1:
24102 case PROCESSOR_BDVER2:
24103 case PROCESSOR_BDVER3:
24104 case PROCESSOR_BTVER1:
24105 return 3;
24107 default:
24108 return 1;
24112 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24113 set by DEP_INSN and nothing else set by DEP_INSN. */
24115 static bool
24116 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24118 rtx set, set2;
24120 /* Simplify the test for uninteresting insns. */
24121 if (insn_type != TYPE_SETCC
24122 && insn_type != TYPE_ICMOV
24123 && insn_type != TYPE_FCMOV
24124 && insn_type != TYPE_IBR)
24125 return false;
24127 if ((set = single_set (dep_insn)) != 0)
24129 set = SET_DEST (set);
24130 set2 = NULL_RTX;
24132 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24133 && XVECLEN (PATTERN (dep_insn), 0) == 2
24134 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24135 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24137 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24138 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24140 else
24141 return false;
24143 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24144 return false;
24146 /* This test is true if the dependent insn reads the flags but
24147 not any other potentially set register. */
24148 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24149 return false;
24151 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24152 return false;
24154 return true;
24157 /* Return true iff USE_INSN has a memory address with operands set by
24158 SET_INSN. */
24160 bool
24161 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24163 int i;
24164 extract_insn_cached (use_insn);
24165 for (i = recog_data.n_operands - 1; i >= 0; --i)
24166 if (MEM_P (recog_data.operand[i]))
24168 rtx addr = XEXP (recog_data.operand[i], 0);
24169 return modified_in_p (addr, set_insn) != 0;
24171 return false;
24174 static int
24175 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24177 enum attr_type insn_type, dep_insn_type;
24178 enum attr_memory memory;
24179 rtx set, set2;
24180 int dep_insn_code_number;
24182 /* Anti and output dependencies have zero cost on all CPUs. */
24183 if (REG_NOTE_KIND (link) != 0)
24184 return 0;
24186 dep_insn_code_number = recog_memoized (dep_insn);
24188 /* If we can't recognize the insns, we can't really do anything. */
24189 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24190 return cost;
24192 insn_type = get_attr_type (insn);
24193 dep_insn_type = get_attr_type (dep_insn);
24195 switch (ix86_tune)
24197 case PROCESSOR_PENTIUM:
24198 /* Address Generation Interlock adds a cycle of latency. */
24199 if (insn_type == TYPE_LEA)
24201 rtx addr = PATTERN (insn);
24203 if (GET_CODE (addr) == PARALLEL)
24204 addr = XVECEXP (addr, 0, 0);
24206 gcc_assert (GET_CODE (addr) == SET);
24208 addr = SET_SRC (addr);
24209 if (modified_in_p (addr, dep_insn))
24210 cost += 1;
24212 else if (ix86_agi_dependent (dep_insn, insn))
24213 cost += 1;
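/* Example (illustrative only): on Pentium, "movl (%eax), %ebx" scheduled
   right after "addl $4, %eax" pays the extra AGI cycle accounted for here,
   because the load's address depends on the just-modified %eax.  */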
24215 /* ??? Compares pair with jump/setcc. */
24216 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24217 cost = 0;
24219 /* Floating point stores require value to be ready one cycle earlier. */
24220 if (insn_type == TYPE_FMOV
24221 && get_attr_memory (insn) == MEMORY_STORE
24222 && !ix86_agi_dependent (dep_insn, insn))
24223 cost += 1;
24224 break;
24226 case PROCESSOR_PENTIUMPRO:
24227 memory = get_attr_memory (insn);
24229 /* INT->FP conversion is expensive. */
24230 if (get_attr_fp_int_src (dep_insn))
24231 cost += 5;
24233 /* There is one cycle extra latency between an FP op and a store. */
24234 if (insn_type == TYPE_FMOV
24235 && (set = single_set (dep_insn)) != NULL_RTX
24236 && (set2 = single_set (insn)) != NULL_RTX
24237 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24238 && MEM_P (SET_DEST (set2)))
24239 cost += 1;
24241 /* Show the ability of the reorder buffer to hide the latency of a load
24242 by executing it in parallel with the previous instruction, in case the
24243 previous instruction is not needed to compute the address. */
24244 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24245 && !ix86_agi_dependent (dep_insn, insn))
24247 /* Claim moves to take one cycle, as the core can issue one load
24248 at a time and the next load can start a cycle later. */
24249 if (dep_insn_type == TYPE_IMOV
24250 || dep_insn_type == TYPE_FMOV)
24251 cost = 1;
24252 else if (cost > 1)
24253 cost--;
24255 break;
24257 case PROCESSOR_K6:
24258 memory = get_attr_memory (insn);
24260 /* The esp dependency is resolved before the instruction is really
24261 finished. */
24262 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24263 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24264 return 1;
24266 /* INT->FP conversion is expensive. */
24267 if (get_attr_fp_int_src (dep_insn))
24268 cost += 5;
24270 /* Show the ability of the reorder buffer to hide the latency of a load
24271 by executing it in parallel with the previous instruction, in case the
24272 previous instruction is not needed to compute the address. */
24273 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24274 && !ix86_agi_dependent (dep_insn, insn))
24277 /* Claim moves to take one cycle, as the core can issue one load
24278 at a time and the next load can start a cycle later. */
24278 if (dep_insn_type == TYPE_IMOV
24279 || dep_insn_type == TYPE_FMOV)
24280 cost = 1;
24281 else if (cost > 2)
24282 cost -= 2;
24283 else
24284 cost = 1;
24286 break;
24288 case PROCESSOR_ATHLON:
24289 case PROCESSOR_K8:
24290 case PROCESSOR_AMDFAM10:
24291 case PROCESSOR_BDVER1:
24292 case PROCESSOR_BDVER2:
24293 case PROCESSOR_BDVER3:
24294 case PROCESSOR_BTVER1:
24295 case PROCESSOR_BTVER2:
24296 case PROCESSOR_ATOM:
24297 case PROCESSOR_GENERIC32:
24298 case PROCESSOR_GENERIC64:
24299 memory = get_attr_memory (insn);
24301 /* Show the ability of the reorder buffer to hide the latency of a load
24302 by executing it in parallel with the previous instruction, in case the
24303 previous instruction is not needed to compute the address. */
24304 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24305 && !ix86_agi_dependent (dep_insn, insn))
24307 enum attr_unit unit = get_attr_unit (insn);
24308 int loadcost = 3;
24310 /* Because of the difference between the lengths of the integer and
24311 floating unit pipeline preparation stages, the memory operands
24312 for floating point are cheaper.
24314 ??? For Athlon the difference is most probably 2. */
24315 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24316 loadcost = 3;
24317 else
24318 loadcost = TARGET_ATHLON ? 2 : 0;
24320 if (cost >= loadcost)
24321 cost -= loadcost;
24322 else
24323 cost = 0;
24326 default:
24327 break;
24330 return cost;
24333 /* How many alternative schedules to try. This should be as wide as the
24334 scheduling freedom in the DFA, but no wider. Making this value too
24335 large results in extra work for the scheduler. */
24337 static int
24338 ia32_multipass_dfa_lookahead (void)
24340 switch (ix86_tune)
24342 case PROCESSOR_PENTIUM:
24343 return 2;
24345 case PROCESSOR_PENTIUMPRO:
24346 case PROCESSOR_K6:
24347 return 1;
24349 case PROCESSOR_CORE2:
24350 case PROCESSOR_COREI7:
24351 case PROCESSOR_HASWELL:
24352 case PROCESSOR_ATOM:
24353 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
24354 the number of instructions that can be executed in one cycle, i.e.,
24355 issue_rate. I wonder why tuning for many CPUs does not do this. */
24356 if (reload_completed)
24357 return ix86_issue_rate ();
24358 /* Don't use lookahead for pre-reload schedule to save compile time. */
24359 return 0;
24361 default:
24362 return 0;
24366 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24367 execution. It is applied if
24368 (1) an IMUL instruction is on the top of the list, and
24369 (2) there is exactly one producer of an independent IMUL instruction in
24370 the ready list;
24371 (3) then the found producer is put on the top of the ready list.
24372 Returns the issue rate. */
24374 static int
24375 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24376 int clock_var ATTRIBUTE_UNUSED)
24378 static int issue_rate = -1;
24379 int n_ready = *pn_ready;
24380 rtx insn, insn1, insn2;
24381 int i;
24382 sd_iterator_def sd_it;
24383 dep_t dep;
24384 int index = -1;
24386 /* Set up issue rate. */
24387 issue_rate = ix86_issue_rate();
24389 /* Do reordering for Atom only. */
24390 if (ix86_tune != PROCESSOR_ATOM)
24391 return issue_rate;
24392 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24393 if (!reload_completed)
24394 return issue_rate;
24395 /* Nothing to do if ready list contains only 1 instruction. */
24396 if (n_ready <= 1)
24397 return issue_rate;
24399 /* Check that IMUL instruction is on the top of ready list. */
24400 insn = ready[n_ready - 1];
24401 if (!NONDEBUG_INSN_P (insn))
24402 return issue_rate;
24403 insn = PATTERN (insn);
24404 if (GET_CODE (insn) == PARALLEL)
24405 insn = XVECEXP (insn, 0, 0);
24406 if (GET_CODE (insn) != SET)
24407 return issue_rate;
24408 if (!(GET_CODE (SET_SRC (insn)) == MULT
24409 && GET_MODE (SET_SRC (insn)) == SImode))
24410 return issue_rate;
24412 /* Search for producer of independent IMUL instruction. */
24413 for (i = n_ready - 2; i>= 0; i--)
24415 insn = ready[i];
24416 if (!NONDEBUG_INSN_P (insn))
24417 continue;
24418 /* Skip IMUL instruction. */
24419 insn2 = PATTERN (insn);
24420 if (GET_CODE (insn2) == PARALLEL)
24421 insn2 = XVECEXP (insn2, 0, 0);
24422 if (GET_CODE (insn2) == SET
24423 && GET_CODE (SET_SRC (insn2)) == MULT
24424 && GET_MODE (SET_SRC (insn2)) == SImode)
24425 continue;
24427 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24429 rtx con;
24430 con = DEP_CON (dep);
24431 if (!NONDEBUG_INSN_P (con))
24432 continue;
24433 insn1 = PATTERN (con);
24434 if (GET_CODE (insn1) == PARALLEL)
24435 insn1 = XVECEXP (insn1, 0, 0);
24437 if (GET_CODE (insn1) == SET
24438 && GET_CODE (SET_SRC (insn1)) == MULT
24439 && GET_MODE (SET_SRC (insn1)) == SImode)
24441 sd_iterator_def sd_it1;
24442 dep_t dep1;
24443 /* Check that there is no other producer for this IMUL. */
24444 index = i;
24445 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24447 rtx pro;
24448 pro = DEP_PRO (dep1);
24449 if (!NONDEBUG_INSN_P (pro))
24450 continue;
24451 if (pro != insn)
24452 index = -1;
24454 if (index >= 0)
24455 break;
24458 if (index >= 0)
24459 break;
24461 if (index < 0)
24462 return issue_rate; /* Didn't find IMUL producer. */
24464 if (sched_verbose > 1)
24465 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24466 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24468 /* Put IMUL producer (ready[index]) at the top of ready list. */
24469 insn1= ready[index];
24470 for (i = index; i < n_ready - 1; i++)
24471 ready[i] = ready[i + 1];
24472 ready[n_ready - 1] = insn1;
24474 return issue_rate;
24477 static bool
24478 ix86_class_likely_spilled_p (reg_class_t);
24480 /* Return true if the lhs of INSN is a HW function argument register; set
24481 IS_SPILLED to true if it is a likely-spilled HW register. */
24482 static bool
24483 insn_is_function_arg (rtx insn, bool* is_spilled)
24485 rtx dst;
24487 if (!NONDEBUG_INSN_P (insn))
24488 return false;
24489 /* Call instructions are not movable; ignore them. */
24490 if (CALL_P (insn))
24491 return false;
24492 insn = PATTERN (insn);
24493 if (GET_CODE (insn) == PARALLEL)
24494 insn = XVECEXP (insn, 0, 0);
24495 if (GET_CODE (insn) != SET)
24496 return false;
24497 dst = SET_DEST (insn);
24498 if (REG_P (dst) && HARD_REGISTER_P (dst)
24499 && ix86_function_arg_regno_p (REGNO (dst)))
24501 /* Is it likely spilled HW register? */
24502 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24503 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24504 *is_spilled = true;
24505 return true;
24507 return false;
24510 /* Add output dependencies for a chain of adjacent function arguments, but
24511 only if there is a move to a likely-spilled HW register. Return the first
24512 argument if at least one dependence was added, or NULL otherwise. */
24513 static rtx
24514 add_parameter_dependencies (rtx call, rtx head)
24516 rtx insn;
24517 rtx last = call;
24518 rtx first_arg = NULL;
24519 bool is_spilled = false;
24521 head = PREV_INSN (head);
24523 /* Find the argument-passing instruction nearest to the call. */
24524 while (true)
24526 last = PREV_INSN (last);
24527 if (last == head)
24528 return NULL;
24529 if (!NONDEBUG_INSN_P (last))
24530 continue;
24531 if (insn_is_function_arg (last, &is_spilled))
24532 break;
24533 return NULL;
24536 first_arg = last;
24537 while (true)
24539 insn = PREV_INSN (last);
24540 if (!INSN_P (insn))
24541 break;
24542 if (insn == head)
24543 break;
24544 if (!NONDEBUG_INSN_P (insn))
24546 last = insn;
24547 continue;
24549 if (insn_is_function_arg (insn, &is_spilled))
24551 /* Add an output dependence between two function arguments if the chain
24552 of output arguments contains likely-spilled HW registers. */
24553 if (is_spilled)
24554 add_dependence (last, insn, REG_DEP_OUTPUT);
24555 first_arg = last = insn;
24557 else
24558 break;
24560 if (!is_spilled)
24561 return NULL;
24562 return first_arg;
24565 /* Add output or anti dependency from insn to first_arg to restrict its code
24566 motion. */
24567 static void
24568 avoid_func_arg_motion (rtx first_arg, rtx insn)
24570 rtx set;
24571 rtx tmp;
24573 set = single_set (insn);
24574 if (!set)
24575 return;
24576 tmp = SET_DEST (set);
24577 if (REG_P (tmp))
24579 /* Add output dependency to the first function argument. */
24580 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24581 return;
24583 /* Add anti dependency. */
24584 add_dependence (first_arg, insn, REG_DEP_ANTI);
24587 /* Avoid cross-block motion of a function argument by adding a dependency
24588 from the first non-jump instruction in the bb. */
24589 static void
24590 add_dependee_for_func_arg (rtx arg, basic_block bb)
24592 rtx insn = BB_END (bb);
24594 while (insn)
24596 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24598 rtx set = single_set (insn);
24599 if (set)
24601 avoid_func_arg_motion (arg, insn);
24602 return;
24605 if (insn == BB_HEAD (bb))
24606 return;
24607 insn = PREV_INSN (insn);
24611 /* Hook for pre-reload schedule - avoid motion of function arguments
24612 passed in likely spilled HW registers. */
24613 static void
24614 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24616 rtx insn;
24617 rtx first_arg = NULL;
24618 if (reload_completed)
24619 return;
24620 while (head != tail && DEBUG_INSN_P (head))
24621 head = NEXT_INSN (head);
24622 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24623 if (INSN_P (insn) && CALL_P (insn))
24625 first_arg = add_parameter_dependencies (insn, head);
24626 if (first_arg)
24628 /* Add a dependee for the first argument to predecessors, but only
24629 if the region contains more than one block. */
24630 basic_block bb = BLOCK_FOR_INSN (insn);
24631 int rgn = CONTAINING_RGN (bb->index);
24632 int nr_blks = RGN_NR_BLOCKS (rgn);
24633 /* Skip trivial regions and region head blocks that can have
24634 predecessors outside of region. */
24635 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24637 edge e;
24638 edge_iterator ei;
24639 /* Assume that the region is an SCC, i.e., all immediate predecessors
24640 of a non-head block are in the same region. */
24641 FOR_EACH_EDGE (e, ei, bb->preds)
24643 /* Avoid creating loop-carried dependencies by using the
24644 topological ordering in the region. */
24645 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24646 add_dependee_for_func_arg (first_arg, e->src);
24649 insn = first_arg;
24650 if (insn == head)
24651 break;
24654 else if (first_arg)
24655 avoid_func_arg_motion (first_arg, insn);
24658 /* Hook for the pre-reload schedule - set the priority of moves from
24659 likely-spilled HW registers to the maximum, to schedule them as soon as
24660 possible. These are moves from function argument registers at the top of
24661 the function entry and moves from function return value registers after a call. */
24662 static int
24663 ix86_adjust_priority (rtx insn, int priority)
24665 rtx set;
24667 if (reload_completed)
24668 return priority;
24670 if (!NONDEBUG_INSN_P (insn))
24671 return priority;
24673 set = single_set (insn);
24674 if (set)
24676 rtx tmp = SET_SRC (set);
24677 if (REG_P (tmp)
24678 && HARD_REGISTER_P (tmp)
24679 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24680 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24681 return current_sched_info->sched_max_insns_priority;
24684 return priority;
24687 /* Model the decoder of Core 2/i7.
24688 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24689 track the instruction fetch block boundaries and make sure that long
24690 (9+ byte) instructions are assigned to D0. */
24692 /* Maximum length of an insn that can be handled by
24693 a secondary decoder unit. '8' for Core 2/i7. */
24694 static int core2i7_secondary_decoder_max_insn_size;
24696 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24697 '16' for Core 2/i7. */
24698 static int core2i7_ifetch_block_size;
24700 /* Maximum number of instructions decoder can handle per cycle.
24701 '6' for Core 2/i7. */
24702 static int core2i7_ifetch_block_max_insns;
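/* Illustrative stand-alone model (kept in a comment; not used by the hooks
   below) of the filtering rule, with the Core 2/i7 numbers quoted above
   hard-coded (8-byte secondary-decoder limit, 16-byte ifetch block, at most
   6 insns decoded per cycle):

     static bool
     insn_fits_current_cycle_p (int block_len, int block_n_insns,
                                int insn_size, bool first_cycle_insn_p)
     {
       if (!first_cycle_insn_p && insn_size > 8)
         return false;       // only decoder D0 handles long insns
       if (block_len + insn_size > 16)
         return false;       // would cross the ifetch block boundary
       if (block_n_insns + 1 > 6)
         return false;       // decoders already full this cycle
       return true;
     }
*/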
24704 typedef struct ix86_first_cycle_multipass_data_ *
24705 ix86_first_cycle_multipass_data_t;
24706 typedef const struct ix86_first_cycle_multipass_data_ *
24707 const_ix86_first_cycle_multipass_data_t;
24709 /* A variable to store target state across calls to max_issue within
24710 one cycle. */
24711 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24712 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24714 /* Initialize DATA. */
24715 static void
24716 core2i7_first_cycle_multipass_init (void *_data)
24718 ix86_first_cycle_multipass_data_t data
24719 = (ix86_first_cycle_multipass_data_t) _data;
24721 data->ifetch_block_len = 0;
24722 data->ifetch_block_n_insns = 0;
24723 data->ready_try_change = NULL;
24724 data->ready_try_change_size = 0;
24727 /* Advancing the cycle; reset ifetch block counts. */
24728 static void
24729 core2i7_dfa_post_advance_cycle (void)
24731 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24733 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24735 data->ifetch_block_len = 0;
24736 data->ifetch_block_n_insns = 0;
24739 static int min_insn_size (rtx);
24741 /* Filter out insns from ready_try that the core will not be able to issue
24742 on current cycle due to decoder. */
24743 static void
24744 core2i7_first_cycle_multipass_filter_ready_try
24745 (const_ix86_first_cycle_multipass_data_t data,
24746 char *ready_try, int n_ready, bool first_cycle_insn_p)
24748 while (n_ready--)
24750 rtx insn;
24751 int insn_size;
24753 if (ready_try[n_ready])
24754 continue;
24756 insn = get_ready_element (n_ready);
24757 insn_size = min_insn_size (insn);
24759 if (/* If this is too long an insn for a secondary decoder ... */
24760 (!first_cycle_insn_p
24761 && insn_size > core2i7_secondary_decoder_max_insn_size)
24762 /* ... or it would not fit into the ifetch block ... */
24763 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24764 /* ... or the decoder is full already ... */
24765 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24766 /* ... mask the insn out. */
24768 ready_try[n_ready] = 1;
24770 if (data->ready_try_change)
24771 bitmap_set_bit (data->ready_try_change, n_ready);
24776 /* Prepare for a new round of multipass lookahead scheduling. */
24777 static void
24778 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24779 bool first_cycle_insn_p)
24781 ix86_first_cycle_multipass_data_t data
24782 = (ix86_first_cycle_multipass_data_t) _data;
24783 const_ix86_first_cycle_multipass_data_t prev_data
24784 = ix86_first_cycle_multipass_data;
24786 /* Restore the state from the end of the previous round. */
24787 data->ifetch_block_len = prev_data->ifetch_block_len;
24788 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24790 /* Filter instructions that cannot be issued on current cycle due to
24791 decoder restrictions. */
24792 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24793 first_cycle_insn_p);
24796 /* INSN is being issued in current solution. Account for its impact on
24797 the decoder model. */
24798 static void
24799 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24800 rtx insn, const void *_prev_data)
24802 ix86_first_cycle_multipass_data_t data
24803 = (ix86_first_cycle_multipass_data_t) _data;
24804 const_ix86_first_cycle_multipass_data_t prev_data
24805 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24807 int insn_size = min_insn_size (insn);
24809 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24810 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24811 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24812 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24814 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24815 if (!data->ready_try_change)
24817 data->ready_try_change = sbitmap_alloc (n_ready);
24818 data->ready_try_change_size = n_ready;
24820 else if (data->ready_try_change_size < n_ready)
24822 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24823 n_ready, 0);
24824 data->ready_try_change_size = n_ready;
24826 bitmap_clear (data->ready_try_change);
24828 /* Filter out insns from ready_try that the core will not be able to issue
24829 on the current cycle due to decoder restrictions. */
24830 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24831 false);
24834 /* Revert the effect on ready_try. */
24835 static void
24836 core2i7_first_cycle_multipass_backtrack (const void *_data,
24837 char *ready_try,
24838 int n_ready ATTRIBUTE_UNUSED)
24840 const_ix86_first_cycle_multipass_data_t data
24841 = (const_ix86_first_cycle_multipass_data_t) _data;
24842 unsigned int i = 0;
24843 sbitmap_iterator sbi;
24845 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24846 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24848 ready_try[i] = 0;
24852 /* Save the result of multipass lookahead scheduling for the next round. */
24853 static void
24854 core2i7_first_cycle_multipass_end (const void *_data)
24856 const_ix86_first_cycle_multipass_data_t data
24857 = (const_ix86_first_cycle_multipass_data_t) _data;
24858 ix86_first_cycle_multipass_data_t next_data
24859 = ix86_first_cycle_multipass_data;
24861 if (data != NULL)
24863 next_data->ifetch_block_len = data->ifetch_block_len;
24864 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24868 /* Deallocate target data. */
24869 static void
24870 core2i7_first_cycle_multipass_fini (void *_data)
24872 ix86_first_cycle_multipass_data_t data
24873 = (ix86_first_cycle_multipass_data_t) _data;
24875 if (data->ready_try_change)
24877 sbitmap_free (data->ready_try_change);
24878 data->ready_try_change = NULL;
24879 data->ready_try_change_size = 0;
24883 /* Prepare for scheduling pass. */
24884 static void
24885 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24886 int verbose ATTRIBUTE_UNUSED,
24887 int max_uid ATTRIBUTE_UNUSED)
24889 /* Install scheduling hooks for the current CPU. Some of these hooks are used
24890 in time-critical parts of the scheduler, so we only set them up when
24891 they are actually used. */
24892 switch (ix86_tune)
24894 case PROCESSOR_CORE2:
24895 case PROCESSOR_COREI7:
24896 case PROCESSOR_HASWELL:
24897 /* Do not perform multipass scheduling for the pre-reload schedule,
24898 to save compile time. */
24899 if (reload_completed)
24901 targetm.sched.dfa_post_advance_cycle
24902 = core2i7_dfa_post_advance_cycle;
24903 targetm.sched.first_cycle_multipass_init
24904 = core2i7_first_cycle_multipass_init;
24905 targetm.sched.first_cycle_multipass_begin
24906 = core2i7_first_cycle_multipass_begin;
24907 targetm.sched.first_cycle_multipass_issue
24908 = core2i7_first_cycle_multipass_issue;
24909 targetm.sched.first_cycle_multipass_backtrack
24910 = core2i7_first_cycle_multipass_backtrack;
24911 targetm.sched.first_cycle_multipass_end
24912 = core2i7_first_cycle_multipass_end;
24913 targetm.sched.first_cycle_multipass_fini
24914 = core2i7_first_cycle_multipass_fini;
24916 /* Set decoder parameters. */
24917 core2i7_secondary_decoder_max_insn_size = 8;
24918 core2i7_ifetch_block_size = 16;
24919 core2i7_ifetch_block_max_insns = 6;
24920 break;
24922 /* ... Fall through ... */
24923 default:
24924 targetm.sched.dfa_post_advance_cycle = NULL;
24925 targetm.sched.first_cycle_multipass_init = NULL;
24926 targetm.sched.first_cycle_multipass_begin = NULL;
24927 targetm.sched.first_cycle_multipass_issue = NULL;
24928 targetm.sched.first_cycle_multipass_backtrack = NULL;
24929 targetm.sched.first_cycle_multipass_end = NULL;
24930 targetm.sched.first_cycle_multipass_fini = NULL;
24931 break;
24936 /* Compute the alignment given to a constant that is being placed in memory.
24937 EXP is the constant and ALIGN is the alignment that the object would
24938 ordinarily have.
24939 The value of this function is used instead of that alignment to align
24940 the object. */
24943 ix86_constant_alignment (tree exp, int align)
24945 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24946 || TREE_CODE (exp) == INTEGER_CST)
24948 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24949 return 64;
24950 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24951 return 128;
24953 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24954 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24955 return BITS_PER_WORD;
24957 return align;
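/* Editor's note: a hedged usage sketch (editor addition, not part of the
   original source), assuming the i386 CONSTANT_ALIGNMENT macro forwards to
   the function above.  A double constant placed in the constant pool with
   the default 32-bit alignment is widened to 64 bits so it can be loaded
   without a split access:

     int align = ix86_constant_alignment (build_real (double_type_node,
                                                      dconst1), 32);
     // align == 64: DFmode constants get 64-bit alignment.

   String constants with TREE_STRING_LENGTH of at least 31 similarly get at
   least word alignment when not optimizing for size.
*/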
24960 /* Compute the alignment for a static variable.
24961 TYPE is the data type, and ALIGN is the alignment that
24962 the object would ordinarily have. The value of this function is used
24963 instead of that alignment to align the object. */
24966 ix86_data_alignment (tree type, int align)
24968 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24970 if (AGGREGATE_TYPE_P (type)
24971 && TYPE_SIZE (type)
24972 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24973 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24974 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24975 && align < max_align)
24976 align = max_align;
24978 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
24979 to a 16-byte boundary. */
24980 if (TARGET_64BIT)
24982 if (AGGREGATE_TYPE_P (type)
24983 && TYPE_SIZE (type)
24984 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24985 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24986 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24987 return 128;
24990 if (TREE_CODE (type) == ARRAY_TYPE)
24992 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24993 return 64;
24994 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24995 return 128;
24997 else if (TREE_CODE (type) == COMPLEX_TYPE)
25000 if (TYPE_MODE (type) == DCmode && align < 64)
25001 return 64;
25002 if ((TYPE_MODE (type) == XCmode
25003 || TYPE_MODE (type) == TCmode) && align < 128)
25004 return 128;
25006 else if ((TREE_CODE (type) == RECORD_TYPE
25007 || TREE_CODE (type) == UNION_TYPE
25008 || TREE_CODE (type) == QUAL_UNION_TYPE)
25009 && TYPE_FIELDS (type))
25011 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25012 return 64;
25013 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25014 return 128;
25016 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25017 || TREE_CODE (type) == INTEGER_TYPE)
25019 if (TYPE_MODE (type) == DFmode && align < 64)
25020 return 64;
25021 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25022 return 128;
25025 return align;
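/* Editor's note: an illustrative sketch (editor addition).  For a static
   object such as

     static char buf[32];

   declared with the default byte alignment and not compiled with -Os,
   max_align above is MIN (256, MAX_OFILE_ALIGNMENT), so the aggregate rule
   raises DATA_ALIGNMENT to 256 bits; the x86-64 >= 16-byte array rule alone
   would still have guaranteed 128 bits.
*/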
25028 /* Compute the alignment for a local variable or a stack slot. EXP is
25029 the data type or decl itself, MODE is the widest mode available and
25030 ALIGN is the alignment that the object would ordinarily have. The
25031 value of this macro is used instead of that alignment to align the
25032 object. */
25034 unsigned int
25035 ix86_local_alignment (tree exp, enum machine_mode mode,
25036 unsigned int align)
25038 tree type, decl;
25040 if (exp && DECL_P (exp))
25042 type = TREE_TYPE (exp);
25043 decl = exp;
25045 else
25047 type = exp;
25048 decl = NULL;
25051 /* Don't do dynamic stack realignment for long long objects with
25052 -mpreferred-stack-boundary=2. */
25053 if (!TARGET_64BIT
25054 && align == 64
25055 && ix86_preferred_stack_boundary < 64
25056 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25057 && (!type || !TYPE_USER_ALIGN (type))
25058 && (!decl || !DECL_USER_ALIGN (decl)))
25059 align = 32;
25061 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
25062 register in MODE. We will return the larger of the XFmode and
25063 DFmode alignments. */
25064 if (!type)
25066 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25067 align = GET_MODE_ALIGNMENT (DFmode);
25068 return align;
25071 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
25072 to a 16-byte boundary. The exact wording is:
25074 An array uses the same alignment as its elements, except that a local or
25075 global array variable of length at least 16 bytes or
25076 a C99 variable-length array variable always has alignment of at least 16 bytes.
25078 This rule was added to allow the use of aligned SSE instructions on arrays.
25079 It is meant for static storage (where the compiler cannot do the analysis
25080 by itself); we follow it for automatic variables only when convenient,
25081 since we fully control everything in the function being compiled and
25082 functions from other units cannot rely on the alignment.
25084 Exclude the va_list type: it is the common case of a local array where
25085 we cannot benefit from the alignment. */
25086 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25087 && TARGET_SSE)
25089 if (AGGREGATE_TYPE_P (type)
25090 && (va_list_type_node == NULL_TREE
25091 || (TYPE_MAIN_VARIANT (type)
25092 != TYPE_MAIN_VARIANT (va_list_type_node)))
25093 && TYPE_SIZE (type)
25094 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25095 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25096 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25097 return 128;
25099 if (TREE_CODE (type) == ARRAY_TYPE)
25101 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25102 return 64;
25103 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25104 return 128;
25106 else if (TREE_CODE (type) == COMPLEX_TYPE)
25108 if (TYPE_MODE (type) == DCmode && align < 64)
25109 return 64;
25110 if ((TYPE_MODE (type) == XCmode
25111 || TYPE_MODE (type) == TCmode) && align < 128)
25112 return 128;
25114 else if ((TREE_CODE (type) == RECORD_TYPE
25115 || TREE_CODE (type) == UNION_TYPE
25116 || TREE_CODE (type) == QUAL_UNION_TYPE)
25117 && TYPE_FIELDS (type))
25119 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25120 return 64;
25121 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25122 return 128;
25124 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25125 || TREE_CODE (type) == INTEGER_TYPE)
25128 if (TYPE_MODE (type) == DFmode && align < 64)
25129 return 64;
25130 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25131 return 128;
25133 return align;
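/* Editor's note: an illustrative sketch (editor addition) of the local
   array rule above.  With -O2 on x86-64 (SSE enabled), a function-local

     char key[16];

   is given 128-bit stack alignment so aligned SSE accesses can be used on
   it, whereas a va_list object, or any local in a function optimized for
   size, keeps its natural alignment.
*/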
25136 /* Compute the minimum required alignment for dynamic stack realignment
25137 purposes for a local variable, parameter or a stack slot. EXP is
25138 the data type or decl itself, MODE is its mode and ALIGN is the
25139 alignment that the object would ordinarily have. */
25141 unsigned int
25142 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25143 unsigned int align)
25145 tree type, decl;
25147 if (exp && DECL_P (exp))
25149 type = TREE_TYPE (exp);
25150 decl = exp;
25152 else
25154 type = exp;
25155 decl = NULL;
25158 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25159 return align;
25161 /* Don't do dynamic stack realignment for long long objects with
25162 -mpreferred-stack-boundary=2. */
25163 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25164 && (!type || !TYPE_USER_ALIGN (type))
25165 && (!decl || !DECL_USER_ALIGN (decl)))
25166 return 32;
25168 return align;
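/* Editor's note: an illustrative sketch (editor addition).  With
   -m32 -mpreferred-stack-boundary=2, a local such as

     long long counter;

   would normally request 64-bit alignment and force dynamic realignment of
   the whole frame; the hook above reports 32 bits for it instead (unless
   the user asked for the alignment explicitly), so no realignment is
   triggered.
*/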
25171 /* Find a location for the static chain incoming to a nested function.
25172 This is a register, unless all free registers are used by arguments. */
25174 static rtx
25175 ix86_static_chain (const_tree fndecl, bool incoming_p)
25177 unsigned regno;
25179 if (!DECL_STATIC_CHAIN (fndecl))
25180 return NULL;
25182 if (TARGET_64BIT)
25184 /* We always use R10 in 64-bit mode. */
25185 regno = R10_REG;
25187 else
25189 tree fntype;
25190 unsigned int ccvt;
25192 /* By default in 32-bit mode we use ECX to pass the static chain. */
25193 regno = CX_REG;
25195 fntype = TREE_TYPE (fndecl);
25196 ccvt = ix86_get_callcvt (fntype);
25197 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25199 /* Fastcall functions use ecx/edx for arguments, which leaves
25200 us with EAX for the static chain.
25201 Thiscall functions use ecx for arguments, which also
25202 leaves us with EAX for the static chain. */
25203 regno = AX_REG;
25205 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25207 /* Thiscall functions use ecx for arguments, which leaves
25208 us with EAX and EDX for the static chain.
25209 We use EAX for ABI compatibility. */
25210 regno = AX_REG;
25212 else if (ix86_function_regparm (fntype, fndecl) == 3)
25214 /* For regparm 3, we have no free call-clobbered registers in
25215 which to store the static chain. In order to implement this,
25216 we have the trampoline push the static chain to the stack.
25217 However, we can't push a value below the return address when
25218 we call the nested function directly, so we have to use an
25219 alternate entry point. For this we use ESI, and have the
25220 alternate entry point push ESI, so that things appear the
25221 same once we're executing the nested function. */
25222 if (incoming_p)
25224 if (fndecl == current_function_decl)
25225 ix86_static_chain_on_stack = true;
25226 return gen_frame_mem (SImode,
25227 plus_constant (Pmode,
25228 arg_pointer_rtx, -8));
25230 regno = SI_REG;
25234 return gen_rtx_REG (Pmode, regno);
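/* Editor's note: an illustrative example (editor addition).  For a nested
   function such as

     void outer (void)
     {
       int x = 0;
       void inner (void) { x++; }
       ...
     }

   GCC passes 'inner' a static chain pointer to outer's frame.  Per the
   logic above it travels in %r10 on x86-64; in 32-bit code it normally
   travels in %ecx, falls back to %eax for fastcall/thiscall, and is pushed
   on the stack (with an alternate entry point that pushes %esi) when all
   three argument registers are taken by regparm(3).
*/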
25237 /* Emit RTL insns to initialize the variable parts of a trampoline.
25238 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25239 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25240 to be passed to the target function. */
25242 static void
25243 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25245 rtx mem, fnaddr;
25246 int opcode;
25247 int offset = 0;
25249 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25251 if (TARGET_64BIT)
25253 int size;
25255 /* Load the function address into r11. Try to load the address
25256 using the shorter movl instead of movabs. We may want to support
25257 movq for kernel mode, but the kernel does not use trampolines at
25258 the moment. FNADDR is a 32-bit address and may not be in
25259 DImode when ptr_mode == SImode. Always use movl in this
25260 case. */
25261 if (ptr_mode == SImode
25262 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25264 fnaddr = copy_addr_to_reg (fnaddr);
25266 mem = adjust_address (m_tramp, HImode, offset);
25267 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25269 mem = adjust_address (m_tramp, SImode, offset + 2);
25270 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25271 offset += 6;
25273 else
25275 mem = adjust_address (m_tramp, HImode, offset);
25276 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25278 mem = adjust_address (m_tramp, DImode, offset + 2);
25279 emit_move_insn (mem, fnaddr);
25280 offset += 10;
25283 /* Load the static chain into r10 using movabs. Use the shorter movl
25284 instead of movabs when ptr_mode == SImode. */
25285 if (ptr_mode == SImode)
25287 opcode = 0xba41;
25288 size = 6;
25290 else
25292 opcode = 0xba49;
25293 size = 10;
25296 mem = adjust_address (m_tramp, HImode, offset);
25297 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25299 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25300 emit_move_insn (mem, chain_value);
25301 offset += size;
25303 /* Jump to r11; the last (unused) byte is a nop, only there to
25304 pad the write out to a single 32-bit store. */
25305 mem = adjust_address (m_tramp, SImode, offset);
25306 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25307 offset += 4;
25309 else
25311 rtx disp, chain;
25313 /* Depending on the static chain location, either load a register
25314 with a constant, or push the constant to the stack. All of the
25315 instructions are the same size. */
25316 chain = ix86_static_chain (fndecl, true);
25317 if (REG_P (chain))
25319 switch (REGNO (chain))
25321 case AX_REG:
25322 opcode = 0xb8; break;
25323 case CX_REG:
25324 opcode = 0xb9; break;
25325 default:
25326 gcc_unreachable ();
25329 else
25330 opcode = 0x68;
25332 mem = adjust_address (m_tramp, QImode, offset);
25333 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25335 mem = adjust_address (m_tramp, SImode, offset + 1);
25336 emit_move_insn (mem, chain_value);
25337 offset += 5;
25339 mem = adjust_address (m_tramp, QImode, offset);
25340 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25342 mem = adjust_address (m_tramp, SImode, offset + 1);
25344 /* Compute the offset from the end of the jmp to the target function.
25345 When the trampoline stores the static chain on the stack, we need
25346 to skip the function's first insn, which pushes the (call-saved)
25347 register used for the static chain; this push is 1 byte. */
25348 offset += 5;
25349 disp = expand_binop (SImode, sub_optab, fnaddr,
25350 plus_constant (Pmode, XEXP (m_tramp, 0),
25351 offset - (MEM_P (chain) ? 1 : 0)),
25352 NULL_RTX, 1, OPTAB_DIRECT);
25353 emit_move_insn (mem, disp);
25356 gcc_assert (offset <= TRAMPOLINE_SIZE);
25358 #ifdef HAVE_ENABLE_EXECUTE_STACK
25359 #ifdef CHECK_EXECUTE_STACK_ENABLED
25360 if (CHECK_EXECUTE_STACK_ENABLED)
25361 #endif
25362 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25363 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25364 #endif
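/* Editor's note: a byte-level sketch of the trampolines emitted above
   (editor addition; immediates are illustrative).  64-bit:

     49 bb <imm64>    movabs $fnaddr, %r11   (or 41 bb <imm32> for movl)
     49 ba <imm64>    movabs $chain,  %r10   (or 41 ba <imm32> for movl)
     49 ff e3 90      rex.WB jmp *%r11; nop  (nop pads the final SImode store)

   32-bit: either "b9 <imm32>" (movl $chain, %ecx), "b8 <imm32>" (movl into
   %eax), or "68 <imm32>" (pushl $chain), followed by "e9 <rel32>" (jmp to
   the target, biased by one byte in the pushed-chain case so the push at
   the function's entry is skipped).
*/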
25367 /* The following file contains several enumerations and data structures
25368 built from the definitions in i386-builtin-types.def. */
25370 #include "i386-builtin-types.inc"
25372 /* Table for the ix86 builtin non-function types. */
25373 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25375 /* Retrieve an element from the above table, building some of
25376 the types lazily. */
25378 static tree
25379 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25381 unsigned int index;
25382 tree type, itype;
25384 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25386 type = ix86_builtin_type_tab[(int) tcode];
25387 if (type != NULL)
25388 return type;
25390 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25391 if (tcode <= IX86_BT_LAST_VECT)
25393 enum machine_mode mode;
25395 index = tcode - IX86_BT_LAST_PRIM - 1;
25396 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25397 mode = ix86_builtin_type_vect_mode[index];
25399 type = build_vector_type_for_mode (itype, mode);
25401 else
25403 int quals;
25405 index = tcode - IX86_BT_LAST_VECT - 1;
25406 if (tcode <= IX86_BT_LAST_PTR)
25407 quals = TYPE_UNQUALIFIED;
25408 else
25409 quals = TYPE_QUAL_CONST;
25411 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25412 if (quals != TYPE_UNQUALIFIED)
25413 itype = build_qualified_type (itype, quals);
25415 type = build_pointer_type (itype);
25418 ix86_builtin_type_tab[(int) tcode] = type;
25419 return type;
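/* Editor's note: a hedged sketch (editor addition) of the lazy construction
   above.  Assuming the generated enumerator for the V4SF vector type is
   IX86_BT_V4SF with base type FLOAT, the first request for it is roughly
   equivalent to:

     tree v4sf = build_vector_type_for_mode (float_type_node, V4SFmode);

   after which the result is cached in ix86_builtin_type_tab, so later
   lookups return the same tree node.
*/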
25422 /* Table for the ix86 builtin function types. */
25423 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25425 /* Retrieve an element from the above table, building some of
25426 the types lazily. */
25428 static tree
25429 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25431 tree type;
25433 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25435 type = ix86_builtin_func_type_tab[(int) tcode];
25436 if (type != NULL)
25437 return type;
25439 if (tcode <= IX86_BT_LAST_FUNC)
25441 unsigned start = ix86_builtin_func_start[(int) tcode];
25442 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25443 tree rtype, atype, args = void_list_node;
25444 unsigned i;
25446 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25447 for (i = after - 1; i > start; --i)
25449 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25450 args = tree_cons (NULL, atype, args);
25453 type = build_function_type (rtype, args);
25455 else
25457 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25458 enum ix86_builtin_func_type icode;
25460 icode = ix86_builtin_func_alias_base[index];
25461 type = ix86_get_builtin_func_type (icode);
25464 ix86_builtin_func_type_tab[(int) tcode] = type;
25465 return type;
25469 /* Codes for all the SSE/MMX builtins. */
25470 enum ix86_builtins
25472 IX86_BUILTIN_ADDPS,
25473 IX86_BUILTIN_ADDSS,
25474 IX86_BUILTIN_DIVPS,
25475 IX86_BUILTIN_DIVSS,
25476 IX86_BUILTIN_MULPS,
25477 IX86_BUILTIN_MULSS,
25478 IX86_BUILTIN_SUBPS,
25479 IX86_BUILTIN_SUBSS,
25481 IX86_BUILTIN_CMPEQPS,
25482 IX86_BUILTIN_CMPLTPS,
25483 IX86_BUILTIN_CMPLEPS,
25484 IX86_BUILTIN_CMPGTPS,
25485 IX86_BUILTIN_CMPGEPS,
25486 IX86_BUILTIN_CMPNEQPS,
25487 IX86_BUILTIN_CMPNLTPS,
25488 IX86_BUILTIN_CMPNLEPS,
25489 IX86_BUILTIN_CMPNGTPS,
25490 IX86_BUILTIN_CMPNGEPS,
25491 IX86_BUILTIN_CMPORDPS,
25492 IX86_BUILTIN_CMPUNORDPS,
25493 IX86_BUILTIN_CMPEQSS,
25494 IX86_BUILTIN_CMPLTSS,
25495 IX86_BUILTIN_CMPLESS,
25496 IX86_BUILTIN_CMPNEQSS,
25497 IX86_BUILTIN_CMPNLTSS,
25498 IX86_BUILTIN_CMPNLESS,
25499 IX86_BUILTIN_CMPNGTSS,
25500 IX86_BUILTIN_CMPNGESS,
25501 IX86_BUILTIN_CMPORDSS,
25502 IX86_BUILTIN_CMPUNORDSS,
25504 IX86_BUILTIN_COMIEQSS,
25505 IX86_BUILTIN_COMILTSS,
25506 IX86_BUILTIN_COMILESS,
25507 IX86_BUILTIN_COMIGTSS,
25508 IX86_BUILTIN_COMIGESS,
25509 IX86_BUILTIN_COMINEQSS,
25510 IX86_BUILTIN_UCOMIEQSS,
25511 IX86_BUILTIN_UCOMILTSS,
25512 IX86_BUILTIN_UCOMILESS,
25513 IX86_BUILTIN_UCOMIGTSS,
25514 IX86_BUILTIN_UCOMIGESS,
25515 IX86_BUILTIN_UCOMINEQSS,
25517 IX86_BUILTIN_CVTPI2PS,
25518 IX86_BUILTIN_CVTPS2PI,
25519 IX86_BUILTIN_CVTSI2SS,
25520 IX86_BUILTIN_CVTSI642SS,
25521 IX86_BUILTIN_CVTSS2SI,
25522 IX86_BUILTIN_CVTSS2SI64,
25523 IX86_BUILTIN_CVTTPS2PI,
25524 IX86_BUILTIN_CVTTSS2SI,
25525 IX86_BUILTIN_CVTTSS2SI64,
25527 IX86_BUILTIN_MAXPS,
25528 IX86_BUILTIN_MAXSS,
25529 IX86_BUILTIN_MINPS,
25530 IX86_BUILTIN_MINSS,
25532 IX86_BUILTIN_LOADUPS,
25533 IX86_BUILTIN_STOREUPS,
25534 IX86_BUILTIN_MOVSS,
25536 IX86_BUILTIN_MOVHLPS,
25537 IX86_BUILTIN_MOVLHPS,
25538 IX86_BUILTIN_LOADHPS,
25539 IX86_BUILTIN_LOADLPS,
25540 IX86_BUILTIN_STOREHPS,
25541 IX86_BUILTIN_STORELPS,
25543 IX86_BUILTIN_MASKMOVQ,
25544 IX86_BUILTIN_MOVMSKPS,
25545 IX86_BUILTIN_PMOVMSKB,
25547 IX86_BUILTIN_MOVNTPS,
25548 IX86_BUILTIN_MOVNTQ,
25550 IX86_BUILTIN_LOADDQU,
25551 IX86_BUILTIN_STOREDQU,
25553 IX86_BUILTIN_PACKSSWB,
25554 IX86_BUILTIN_PACKSSDW,
25555 IX86_BUILTIN_PACKUSWB,
25557 IX86_BUILTIN_PADDB,
25558 IX86_BUILTIN_PADDW,
25559 IX86_BUILTIN_PADDD,
25560 IX86_BUILTIN_PADDQ,
25561 IX86_BUILTIN_PADDSB,
25562 IX86_BUILTIN_PADDSW,
25563 IX86_BUILTIN_PADDUSB,
25564 IX86_BUILTIN_PADDUSW,
25565 IX86_BUILTIN_PSUBB,
25566 IX86_BUILTIN_PSUBW,
25567 IX86_BUILTIN_PSUBD,
25568 IX86_BUILTIN_PSUBQ,
25569 IX86_BUILTIN_PSUBSB,
25570 IX86_BUILTIN_PSUBSW,
25571 IX86_BUILTIN_PSUBUSB,
25572 IX86_BUILTIN_PSUBUSW,
25574 IX86_BUILTIN_PAND,
25575 IX86_BUILTIN_PANDN,
25576 IX86_BUILTIN_POR,
25577 IX86_BUILTIN_PXOR,
25579 IX86_BUILTIN_PAVGB,
25580 IX86_BUILTIN_PAVGW,
25582 IX86_BUILTIN_PCMPEQB,
25583 IX86_BUILTIN_PCMPEQW,
25584 IX86_BUILTIN_PCMPEQD,
25585 IX86_BUILTIN_PCMPGTB,
25586 IX86_BUILTIN_PCMPGTW,
25587 IX86_BUILTIN_PCMPGTD,
25589 IX86_BUILTIN_PMADDWD,
25591 IX86_BUILTIN_PMAXSW,
25592 IX86_BUILTIN_PMAXUB,
25593 IX86_BUILTIN_PMINSW,
25594 IX86_BUILTIN_PMINUB,
25596 IX86_BUILTIN_PMULHUW,
25597 IX86_BUILTIN_PMULHW,
25598 IX86_BUILTIN_PMULLW,
25600 IX86_BUILTIN_PSADBW,
25601 IX86_BUILTIN_PSHUFW,
25603 IX86_BUILTIN_PSLLW,
25604 IX86_BUILTIN_PSLLD,
25605 IX86_BUILTIN_PSLLQ,
25606 IX86_BUILTIN_PSRAW,
25607 IX86_BUILTIN_PSRAD,
25608 IX86_BUILTIN_PSRLW,
25609 IX86_BUILTIN_PSRLD,
25610 IX86_BUILTIN_PSRLQ,
25611 IX86_BUILTIN_PSLLWI,
25612 IX86_BUILTIN_PSLLDI,
25613 IX86_BUILTIN_PSLLQI,
25614 IX86_BUILTIN_PSRAWI,
25615 IX86_BUILTIN_PSRADI,
25616 IX86_BUILTIN_PSRLWI,
25617 IX86_BUILTIN_PSRLDI,
25618 IX86_BUILTIN_PSRLQI,
25620 IX86_BUILTIN_PUNPCKHBW,
25621 IX86_BUILTIN_PUNPCKHWD,
25622 IX86_BUILTIN_PUNPCKHDQ,
25623 IX86_BUILTIN_PUNPCKLBW,
25624 IX86_BUILTIN_PUNPCKLWD,
25625 IX86_BUILTIN_PUNPCKLDQ,
25627 IX86_BUILTIN_SHUFPS,
25629 IX86_BUILTIN_RCPPS,
25630 IX86_BUILTIN_RCPSS,
25631 IX86_BUILTIN_RSQRTPS,
25632 IX86_BUILTIN_RSQRTPS_NR,
25633 IX86_BUILTIN_RSQRTSS,
25634 IX86_BUILTIN_RSQRTF,
25635 IX86_BUILTIN_SQRTPS,
25636 IX86_BUILTIN_SQRTPS_NR,
25637 IX86_BUILTIN_SQRTSS,
25639 IX86_BUILTIN_UNPCKHPS,
25640 IX86_BUILTIN_UNPCKLPS,
25642 IX86_BUILTIN_ANDPS,
25643 IX86_BUILTIN_ANDNPS,
25644 IX86_BUILTIN_ORPS,
25645 IX86_BUILTIN_XORPS,
25647 IX86_BUILTIN_EMMS,
25648 IX86_BUILTIN_LDMXCSR,
25649 IX86_BUILTIN_STMXCSR,
25650 IX86_BUILTIN_SFENCE,
25652 IX86_BUILTIN_FXSAVE,
25653 IX86_BUILTIN_FXRSTOR,
25654 IX86_BUILTIN_FXSAVE64,
25655 IX86_BUILTIN_FXRSTOR64,
25657 IX86_BUILTIN_XSAVE,
25658 IX86_BUILTIN_XRSTOR,
25659 IX86_BUILTIN_XSAVE64,
25660 IX86_BUILTIN_XRSTOR64,
25662 IX86_BUILTIN_XSAVEOPT,
25663 IX86_BUILTIN_XSAVEOPT64,
25665 /* 3DNow! Original */
25666 IX86_BUILTIN_FEMMS,
25667 IX86_BUILTIN_PAVGUSB,
25668 IX86_BUILTIN_PF2ID,
25669 IX86_BUILTIN_PFACC,
25670 IX86_BUILTIN_PFADD,
25671 IX86_BUILTIN_PFCMPEQ,
25672 IX86_BUILTIN_PFCMPGE,
25673 IX86_BUILTIN_PFCMPGT,
25674 IX86_BUILTIN_PFMAX,
25675 IX86_BUILTIN_PFMIN,
25676 IX86_BUILTIN_PFMUL,
25677 IX86_BUILTIN_PFRCP,
25678 IX86_BUILTIN_PFRCPIT1,
25679 IX86_BUILTIN_PFRCPIT2,
25680 IX86_BUILTIN_PFRSQIT1,
25681 IX86_BUILTIN_PFRSQRT,
25682 IX86_BUILTIN_PFSUB,
25683 IX86_BUILTIN_PFSUBR,
25684 IX86_BUILTIN_PI2FD,
25685 IX86_BUILTIN_PMULHRW,
25687 /* 3DNow! Athlon Extensions */
25688 IX86_BUILTIN_PF2IW,
25689 IX86_BUILTIN_PFNACC,
25690 IX86_BUILTIN_PFPNACC,
25691 IX86_BUILTIN_PI2FW,
25692 IX86_BUILTIN_PSWAPDSI,
25693 IX86_BUILTIN_PSWAPDSF,
25695 /* SSE2 */
25696 IX86_BUILTIN_ADDPD,
25697 IX86_BUILTIN_ADDSD,
25698 IX86_BUILTIN_DIVPD,
25699 IX86_BUILTIN_DIVSD,
25700 IX86_BUILTIN_MULPD,
25701 IX86_BUILTIN_MULSD,
25702 IX86_BUILTIN_SUBPD,
25703 IX86_BUILTIN_SUBSD,
25705 IX86_BUILTIN_CMPEQPD,
25706 IX86_BUILTIN_CMPLTPD,
25707 IX86_BUILTIN_CMPLEPD,
25708 IX86_BUILTIN_CMPGTPD,
25709 IX86_BUILTIN_CMPGEPD,
25710 IX86_BUILTIN_CMPNEQPD,
25711 IX86_BUILTIN_CMPNLTPD,
25712 IX86_BUILTIN_CMPNLEPD,
25713 IX86_BUILTIN_CMPNGTPD,
25714 IX86_BUILTIN_CMPNGEPD,
25715 IX86_BUILTIN_CMPORDPD,
25716 IX86_BUILTIN_CMPUNORDPD,
25717 IX86_BUILTIN_CMPEQSD,
25718 IX86_BUILTIN_CMPLTSD,
25719 IX86_BUILTIN_CMPLESD,
25720 IX86_BUILTIN_CMPNEQSD,
25721 IX86_BUILTIN_CMPNLTSD,
25722 IX86_BUILTIN_CMPNLESD,
25723 IX86_BUILTIN_CMPORDSD,
25724 IX86_BUILTIN_CMPUNORDSD,
25726 IX86_BUILTIN_COMIEQSD,
25727 IX86_BUILTIN_COMILTSD,
25728 IX86_BUILTIN_COMILESD,
25729 IX86_BUILTIN_COMIGTSD,
25730 IX86_BUILTIN_COMIGESD,
25731 IX86_BUILTIN_COMINEQSD,
25732 IX86_BUILTIN_UCOMIEQSD,
25733 IX86_BUILTIN_UCOMILTSD,
25734 IX86_BUILTIN_UCOMILESD,
25735 IX86_BUILTIN_UCOMIGTSD,
25736 IX86_BUILTIN_UCOMIGESD,
25737 IX86_BUILTIN_UCOMINEQSD,
25739 IX86_BUILTIN_MAXPD,
25740 IX86_BUILTIN_MAXSD,
25741 IX86_BUILTIN_MINPD,
25742 IX86_BUILTIN_MINSD,
25744 IX86_BUILTIN_ANDPD,
25745 IX86_BUILTIN_ANDNPD,
25746 IX86_BUILTIN_ORPD,
25747 IX86_BUILTIN_XORPD,
25749 IX86_BUILTIN_SQRTPD,
25750 IX86_BUILTIN_SQRTSD,
25752 IX86_BUILTIN_UNPCKHPD,
25753 IX86_BUILTIN_UNPCKLPD,
25755 IX86_BUILTIN_SHUFPD,
25757 IX86_BUILTIN_LOADUPD,
25758 IX86_BUILTIN_STOREUPD,
25759 IX86_BUILTIN_MOVSD,
25761 IX86_BUILTIN_LOADHPD,
25762 IX86_BUILTIN_LOADLPD,
25764 IX86_BUILTIN_CVTDQ2PD,
25765 IX86_BUILTIN_CVTDQ2PS,
25767 IX86_BUILTIN_CVTPD2DQ,
25768 IX86_BUILTIN_CVTPD2PI,
25769 IX86_BUILTIN_CVTPD2PS,
25770 IX86_BUILTIN_CVTTPD2DQ,
25771 IX86_BUILTIN_CVTTPD2PI,
25773 IX86_BUILTIN_CVTPI2PD,
25774 IX86_BUILTIN_CVTSI2SD,
25775 IX86_BUILTIN_CVTSI642SD,
25777 IX86_BUILTIN_CVTSD2SI,
25778 IX86_BUILTIN_CVTSD2SI64,
25779 IX86_BUILTIN_CVTSD2SS,
25780 IX86_BUILTIN_CVTSS2SD,
25781 IX86_BUILTIN_CVTTSD2SI,
25782 IX86_BUILTIN_CVTTSD2SI64,
25784 IX86_BUILTIN_CVTPS2DQ,
25785 IX86_BUILTIN_CVTPS2PD,
25786 IX86_BUILTIN_CVTTPS2DQ,
25788 IX86_BUILTIN_MOVNTI,
25789 IX86_BUILTIN_MOVNTI64,
25790 IX86_BUILTIN_MOVNTPD,
25791 IX86_BUILTIN_MOVNTDQ,
25793 IX86_BUILTIN_MOVQ128,
25795 /* SSE2 MMX */
25796 IX86_BUILTIN_MASKMOVDQU,
25797 IX86_BUILTIN_MOVMSKPD,
25798 IX86_BUILTIN_PMOVMSKB128,
25800 IX86_BUILTIN_PACKSSWB128,
25801 IX86_BUILTIN_PACKSSDW128,
25802 IX86_BUILTIN_PACKUSWB128,
25804 IX86_BUILTIN_PADDB128,
25805 IX86_BUILTIN_PADDW128,
25806 IX86_BUILTIN_PADDD128,
25807 IX86_BUILTIN_PADDQ128,
25808 IX86_BUILTIN_PADDSB128,
25809 IX86_BUILTIN_PADDSW128,
25810 IX86_BUILTIN_PADDUSB128,
25811 IX86_BUILTIN_PADDUSW128,
25812 IX86_BUILTIN_PSUBB128,
25813 IX86_BUILTIN_PSUBW128,
25814 IX86_BUILTIN_PSUBD128,
25815 IX86_BUILTIN_PSUBQ128,
25816 IX86_BUILTIN_PSUBSB128,
25817 IX86_BUILTIN_PSUBSW128,
25818 IX86_BUILTIN_PSUBUSB128,
25819 IX86_BUILTIN_PSUBUSW128,
25821 IX86_BUILTIN_PAND128,
25822 IX86_BUILTIN_PANDN128,
25823 IX86_BUILTIN_POR128,
25824 IX86_BUILTIN_PXOR128,
25826 IX86_BUILTIN_PAVGB128,
25827 IX86_BUILTIN_PAVGW128,
25829 IX86_BUILTIN_PCMPEQB128,
25830 IX86_BUILTIN_PCMPEQW128,
25831 IX86_BUILTIN_PCMPEQD128,
25832 IX86_BUILTIN_PCMPGTB128,
25833 IX86_BUILTIN_PCMPGTW128,
25834 IX86_BUILTIN_PCMPGTD128,
25836 IX86_BUILTIN_PMADDWD128,
25838 IX86_BUILTIN_PMAXSW128,
25839 IX86_BUILTIN_PMAXUB128,
25840 IX86_BUILTIN_PMINSW128,
25841 IX86_BUILTIN_PMINUB128,
25843 IX86_BUILTIN_PMULUDQ,
25844 IX86_BUILTIN_PMULUDQ128,
25845 IX86_BUILTIN_PMULHUW128,
25846 IX86_BUILTIN_PMULHW128,
25847 IX86_BUILTIN_PMULLW128,
25849 IX86_BUILTIN_PSADBW128,
25850 IX86_BUILTIN_PSHUFHW,
25851 IX86_BUILTIN_PSHUFLW,
25852 IX86_BUILTIN_PSHUFD,
25854 IX86_BUILTIN_PSLLDQI128,
25855 IX86_BUILTIN_PSLLWI128,
25856 IX86_BUILTIN_PSLLDI128,
25857 IX86_BUILTIN_PSLLQI128,
25858 IX86_BUILTIN_PSRAWI128,
25859 IX86_BUILTIN_PSRADI128,
25860 IX86_BUILTIN_PSRLDQI128,
25861 IX86_BUILTIN_PSRLWI128,
25862 IX86_BUILTIN_PSRLDI128,
25863 IX86_BUILTIN_PSRLQI128,
25865 IX86_BUILTIN_PSLLDQ128,
25866 IX86_BUILTIN_PSLLW128,
25867 IX86_BUILTIN_PSLLD128,
25868 IX86_BUILTIN_PSLLQ128,
25869 IX86_BUILTIN_PSRAW128,
25870 IX86_BUILTIN_PSRAD128,
25871 IX86_BUILTIN_PSRLW128,
25872 IX86_BUILTIN_PSRLD128,
25873 IX86_BUILTIN_PSRLQ128,
25875 IX86_BUILTIN_PUNPCKHBW128,
25876 IX86_BUILTIN_PUNPCKHWD128,
25877 IX86_BUILTIN_PUNPCKHDQ128,
25878 IX86_BUILTIN_PUNPCKHQDQ128,
25879 IX86_BUILTIN_PUNPCKLBW128,
25880 IX86_BUILTIN_PUNPCKLWD128,
25881 IX86_BUILTIN_PUNPCKLDQ128,
25882 IX86_BUILTIN_PUNPCKLQDQ128,
25884 IX86_BUILTIN_CLFLUSH,
25885 IX86_BUILTIN_MFENCE,
25886 IX86_BUILTIN_LFENCE,
25887 IX86_BUILTIN_PAUSE,
25889 IX86_BUILTIN_BSRSI,
25890 IX86_BUILTIN_BSRDI,
25891 IX86_BUILTIN_RDPMC,
25892 IX86_BUILTIN_RDTSC,
25893 IX86_BUILTIN_RDTSCP,
25894 IX86_BUILTIN_ROLQI,
25895 IX86_BUILTIN_ROLHI,
25896 IX86_BUILTIN_RORQI,
25897 IX86_BUILTIN_RORHI,
25899 /* SSE3. */
25900 IX86_BUILTIN_ADDSUBPS,
25901 IX86_BUILTIN_HADDPS,
25902 IX86_BUILTIN_HSUBPS,
25903 IX86_BUILTIN_MOVSHDUP,
25904 IX86_BUILTIN_MOVSLDUP,
25905 IX86_BUILTIN_ADDSUBPD,
25906 IX86_BUILTIN_HADDPD,
25907 IX86_BUILTIN_HSUBPD,
25908 IX86_BUILTIN_LDDQU,
25910 IX86_BUILTIN_MONITOR,
25911 IX86_BUILTIN_MWAIT,
25913 /* SSSE3. */
25914 IX86_BUILTIN_PHADDW,
25915 IX86_BUILTIN_PHADDD,
25916 IX86_BUILTIN_PHADDSW,
25917 IX86_BUILTIN_PHSUBW,
25918 IX86_BUILTIN_PHSUBD,
25919 IX86_BUILTIN_PHSUBSW,
25920 IX86_BUILTIN_PMADDUBSW,
25921 IX86_BUILTIN_PMULHRSW,
25922 IX86_BUILTIN_PSHUFB,
25923 IX86_BUILTIN_PSIGNB,
25924 IX86_BUILTIN_PSIGNW,
25925 IX86_BUILTIN_PSIGND,
25926 IX86_BUILTIN_PALIGNR,
25927 IX86_BUILTIN_PABSB,
25928 IX86_BUILTIN_PABSW,
25929 IX86_BUILTIN_PABSD,
25931 IX86_BUILTIN_PHADDW128,
25932 IX86_BUILTIN_PHADDD128,
25933 IX86_BUILTIN_PHADDSW128,
25934 IX86_BUILTIN_PHSUBW128,
25935 IX86_BUILTIN_PHSUBD128,
25936 IX86_BUILTIN_PHSUBSW128,
25937 IX86_BUILTIN_PMADDUBSW128,
25938 IX86_BUILTIN_PMULHRSW128,
25939 IX86_BUILTIN_PSHUFB128,
25940 IX86_BUILTIN_PSIGNB128,
25941 IX86_BUILTIN_PSIGNW128,
25942 IX86_BUILTIN_PSIGND128,
25943 IX86_BUILTIN_PALIGNR128,
25944 IX86_BUILTIN_PABSB128,
25945 IX86_BUILTIN_PABSW128,
25946 IX86_BUILTIN_PABSD128,
25948 /* AMDFAM10 - SSE4A New Instructions. */
25949 IX86_BUILTIN_MOVNTSD,
25950 IX86_BUILTIN_MOVNTSS,
25951 IX86_BUILTIN_EXTRQI,
25952 IX86_BUILTIN_EXTRQ,
25953 IX86_BUILTIN_INSERTQI,
25954 IX86_BUILTIN_INSERTQ,
25956 /* SSE4.1. */
25957 IX86_BUILTIN_BLENDPD,
25958 IX86_BUILTIN_BLENDPS,
25959 IX86_BUILTIN_BLENDVPD,
25960 IX86_BUILTIN_BLENDVPS,
25961 IX86_BUILTIN_PBLENDVB128,
25962 IX86_BUILTIN_PBLENDW128,
25964 IX86_BUILTIN_DPPD,
25965 IX86_BUILTIN_DPPS,
25967 IX86_BUILTIN_INSERTPS128,
25969 IX86_BUILTIN_MOVNTDQA,
25970 IX86_BUILTIN_MPSADBW128,
25971 IX86_BUILTIN_PACKUSDW128,
25972 IX86_BUILTIN_PCMPEQQ,
25973 IX86_BUILTIN_PHMINPOSUW128,
25975 IX86_BUILTIN_PMAXSB128,
25976 IX86_BUILTIN_PMAXSD128,
25977 IX86_BUILTIN_PMAXUD128,
25978 IX86_BUILTIN_PMAXUW128,
25980 IX86_BUILTIN_PMINSB128,
25981 IX86_BUILTIN_PMINSD128,
25982 IX86_BUILTIN_PMINUD128,
25983 IX86_BUILTIN_PMINUW128,
25985 IX86_BUILTIN_PMOVSXBW128,
25986 IX86_BUILTIN_PMOVSXBD128,
25987 IX86_BUILTIN_PMOVSXBQ128,
25988 IX86_BUILTIN_PMOVSXWD128,
25989 IX86_BUILTIN_PMOVSXWQ128,
25990 IX86_BUILTIN_PMOVSXDQ128,
25992 IX86_BUILTIN_PMOVZXBW128,
25993 IX86_BUILTIN_PMOVZXBD128,
25994 IX86_BUILTIN_PMOVZXBQ128,
25995 IX86_BUILTIN_PMOVZXWD128,
25996 IX86_BUILTIN_PMOVZXWQ128,
25997 IX86_BUILTIN_PMOVZXDQ128,
25999 IX86_BUILTIN_PMULDQ128,
26000 IX86_BUILTIN_PMULLD128,
26002 IX86_BUILTIN_ROUNDSD,
26003 IX86_BUILTIN_ROUNDSS,
26005 IX86_BUILTIN_ROUNDPD,
26006 IX86_BUILTIN_ROUNDPS,
26008 IX86_BUILTIN_FLOORPD,
26009 IX86_BUILTIN_CEILPD,
26010 IX86_BUILTIN_TRUNCPD,
26011 IX86_BUILTIN_RINTPD,
26012 IX86_BUILTIN_ROUNDPD_AZ,
26014 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26015 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26016 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26018 IX86_BUILTIN_FLOORPS,
26019 IX86_BUILTIN_CEILPS,
26020 IX86_BUILTIN_TRUNCPS,
26021 IX86_BUILTIN_RINTPS,
26022 IX86_BUILTIN_ROUNDPS_AZ,
26024 IX86_BUILTIN_FLOORPS_SFIX,
26025 IX86_BUILTIN_CEILPS_SFIX,
26026 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26028 IX86_BUILTIN_PTESTZ,
26029 IX86_BUILTIN_PTESTC,
26030 IX86_BUILTIN_PTESTNZC,
26032 IX86_BUILTIN_VEC_INIT_V2SI,
26033 IX86_BUILTIN_VEC_INIT_V4HI,
26034 IX86_BUILTIN_VEC_INIT_V8QI,
26035 IX86_BUILTIN_VEC_EXT_V2DF,
26036 IX86_BUILTIN_VEC_EXT_V2DI,
26037 IX86_BUILTIN_VEC_EXT_V4SF,
26038 IX86_BUILTIN_VEC_EXT_V4SI,
26039 IX86_BUILTIN_VEC_EXT_V8HI,
26040 IX86_BUILTIN_VEC_EXT_V2SI,
26041 IX86_BUILTIN_VEC_EXT_V4HI,
26042 IX86_BUILTIN_VEC_EXT_V16QI,
26043 IX86_BUILTIN_VEC_SET_V2DI,
26044 IX86_BUILTIN_VEC_SET_V4SF,
26045 IX86_BUILTIN_VEC_SET_V4SI,
26046 IX86_BUILTIN_VEC_SET_V8HI,
26047 IX86_BUILTIN_VEC_SET_V4HI,
26048 IX86_BUILTIN_VEC_SET_V16QI,
26050 IX86_BUILTIN_VEC_PACK_SFIX,
26051 IX86_BUILTIN_VEC_PACK_SFIX256,
26053 /* SSE4.2. */
26054 IX86_BUILTIN_CRC32QI,
26055 IX86_BUILTIN_CRC32HI,
26056 IX86_BUILTIN_CRC32SI,
26057 IX86_BUILTIN_CRC32DI,
26059 IX86_BUILTIN_PCMPESTRI128,
26060 IX86_BUILTIN_PCMPESTRM128,
26061 IX86_BUILTIN_PCMPESTRA128,
26062 IX86_BUILTIN_PCMPESTRC128,
26063 IX86_BUILTIN_PCMPESTRO128,
26064 IX86_BUILTIN_PCMPESTRS128,
26065 IX86_BUILTIN_PCMPESTRZ128,
26066 IX86_BUILTIN_PCMPISTRI128,
26067 IX86_BUILTIN_PCMPISTRM128,
26068 IX86_BUILTIN_PCMPISTRA128,
26069 IX86_BUILTIN_PCMPISTRC128,
26070 IX86_BUILTIN_PCMPISTRO128,
26071 IX86_BUILTIN_PCMPISTRS128,
26072 IX86_BUILTIN_PCMPISTRZ128,
26074 IX86_BUILTIN_PCMPGTQ,
26076 /* AES instructions */
26077 IX86_BUILTIN_AESENC128,
26078 IX86_BUILTIN_AESENCLAST128,
26079 IX86_BUILTIN_AESDEC128,
26080 IX86_BUILTIN_AESDECLAST128,
26081 IX86_BUILTIN_AESIMC128,
26082 IX86_BUILTIN_AESKEYGENASSIST128,
26084 /* PCLMUL instruction */
26085 IX86_BUILTIN_PCLMULQDQ128,
26087 /* AVX */
26088 IX86_BUILTIN_ADDPD256,
26089 IX86_BUILTIN_ADDPS256,
26090 IX86_BUILTIN_ADDSUBPD256,
26091 IX86_BUILTIN_ADDSUBPS256,
26092 IX86_BUILTIN_ANDPD256,
26093 IX86_BUILTIN_ANDPS256,
26094 IX86_BUILTIN_ANDNPD256,
26095 IX86_BUILTIN_ANDNPS256,
26096 IX86_BUILTIN_BLENDPD256,
26097 IX86_BUILTIN_BLENDPS256,
26098 IX86_BUILTIN_BLENDVPD256,
26099 IX86_BUILTIN_BLENDVPS256,
26100 IX86_BUILTIN_DIVPD256,
26101 IX86_BUILTIN_DIVPS256,
26102 IX86_BUILTIN_DPPS256,
26103 IX86_BUILTIN_HADDPD256,
26104 IX86_BUILTIN_HADDPS256,
26105 IX86_BUILTIN_HSUBPD256,
26106 IX86_BUILTIN_HSUBPS256,
26107 IX86_BUILTIN_MAXPD256,
26108 IX86_BUILTIN_MAXPS256,
26109 IX86_BUILTIN_MINPD256,
26110 IX86_BUILTIN_MINPS256,
26111 IX86_BUILTIN_MULPD256,
26112 IX86_BUILTIN_MULPS256,
26113 IX86_BUILTIN_ORPD256,
26114 IX86_BUILTIN_ORPS256,
26115 IX86_BUILTIN_SHUFPD256,
26116 IX86_BUILTIN_SHUFPS256,
26117 IX86_BUILTIN_SUBPD256,
26118 IX86_BUILTIN_SUBPS256,
26119 IX86_BUILTIN_XORPD256,
26120 IX86_BUILTIN_XORPS256,
26121 IX86_BUILTIN_CMPSD,
26122 IX86_BUILTIN_CMPSS,
26123 IX86_BUILTIN_CMPPD,
26124 IX86_BUILTIN_CMPPS,
26125 IX86_BUILTIN_CMPPD256,
26126 IX86_BUILTIN_CMPPS256,
26127 IX86_BUILTIN_CVTDQ2PD256,
26128 IX86_BUILTIN_CVTDQ2PS256,
26129 IX86_BUILTIN_CVTPD2PS256,
26130 IX86_BUILTIN_CVTPS2DQ256,
26131 IX86_BUILTIN_CVTPS2PD256,
26132 IX86_BUILTIN_CVTTPD2DQ256,
26133 IX86_BUILTIN_CVTPD2DQ256,
26134 IX86_BUILTIN_CVTTPS2DQ256,
26135 IX86_BUILTIN_EXTRACTF128PD256,
26136 IX86_BUILTIN_EXTRACTF128PS256,
26137 IX86_BUILTIN_EXTRACTF128SI256,
26138 IX86_BUILTIN_VZEROALL,
26139 IX86_BUILTIN_VZEROUPPER,
26140 IX86_BUILTIN_VPERMILVARPD,
26141 IX86_BUILTIN_VPERMILVARPS,
26142 IX86_BUILTIN_VPERMILVARPD256,
26143 IX86_BUILTIN_VPERMILVARPS256,
26144 IX86_BUILTIN_VPERMILPD,
26145 IX86_BUILTIN_VPERMILPS,
26146 IX86_BUILTIN_VPERMILPD256,
26147 IX86_BUILTIN_VPERMILPS256,
26148 IX86_BUILTIN_VPERMIL2PD,
26149 IX86_BUILTIN_VPERMIL2PS,
26150 IX86_BUILTIN_VPERMIL2PD256,
26151 IX86_BUILTIN_VPERMIL2PS256,
26152 IX86_BUILTIN_VPERM2F128PD256,
26153 IX86_BUILTIN_VPERM2F128PS256,
26154 IX86_BUILTIN_VPERM2F128SI256,
26155 IX86_BUILTIN_VBROADCASTSS,
26156 IX86_BUILTIN_VBROADCASTSD256,
26157 IX86_BUILTIN_VBROADCASTSS256,
26158 IX86_BUILTIN_VBROADCASTPD256,
26159 IX86_BUILTIN_VBROADCASTPS256,
26160 IX86_BUILTIN_VINSERTF128PD256,
26161 IX86_BUILTIN_VINSERTF128PS256,
26162 IX86_BUILTIN_VINSERTF128SI256,
26163 IX86_BUILTIN_LOADUPD256,
26164 IX86_BUILTIN_LOADUPS256,
26165 IX86_BUILTIN_STOREUPD256,
26166 IX86_BUILTIN_STOREUPS256,
26167 IX86_BUILTIN_LDDQU256,
26168 IX86_BUILTIN_MOVNTDQ256,
26169 IX86_BUILTIN_MOVNTPD256,
26170 IX86_BUILTIN_MOVNTPS256,
26171 IX86_BUILTIN_LOADDQU256,
26172 IX86_BUILTIN_STOREDQU256,
26173 IX86_BUILTIN_MASKLOADPD,
26174 IX86_BUILTIN_MASKLOADPS,
26175 IX86_BUILTIN_MASKSTOREPD,
26176 IX86_BUILTIN_MASKSTOREPS,
26177 IX86_BUILTIN_MASKLOADPD256,
26178 IX86_BUILTIN_MASKLOADPS256,
26179 IX86_BUILTIN_MASKSTOREPD256,
26180 IX86_BUILTIN_MASKSTOREPS256,
26181 IX86_BUILTIN_MOVSHDUP256,
26182 IX86_BUILTIN_MOVSLDUP256,
26183 IX86_BUILTIN_MOVDDUP256,
26185 IX86_BUILTIN_SQRTPD256,
26186 IX86_BUILTIN_SQRTPS256,
26187 IX86_BUILTIN_SQRTPS_NR256,
26188 IX86_BUILTIN_RSQRTPS256,
26189 IX86_BUILTIN_RSQRTPS_NR256,
26191 IX86_BUILTIN_RCPPS256,
26193 IX86_BUILTIN_ROUNDPD256,
26194 IX86_BUILTIN_ROUNDPS256,
26196 IX86_BUILTIN_FLOORPD256,
26197 IX86_BUILTIN_CEILPD256,
26198 IX86_BUILTIN_TRUNCPD256,
26199 IX86_BUILTIN_RINTPD256,
26200 IX86_BUILTIN_ROUNDPD_AZ256,
26202 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26203 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26204 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26206 IX86_BUILTIN_FLOORPS256,
26207 IX86_BUILTIN_CEILPS256,
26208 IX86_BUILTIN_TRUNCPS256,
26209 IX86_BUILTIN_RINTPS256,
26210 IX86_BUILTIN_ROUNDPS_AZ256,
26212 IX86_BUILTIN_FLOORPS_SFIX256,
26213 IX86_BUILTIN_CEILPS_SFIX256,
26214 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26216 IX86_BUILTIN_UNPCKHPD256,
26217 IX86_BUILTIN_UNPCKLPD256,
26218 IX86_BUILTIN_UNPCKHPS256,
26219 IX86_BUILTIN_UNPCKLPS256,
26221 IX86_BUILTIN_SI256_SI,
26222 IX86_BUILTIN_PS256_PS,
26223 IX86_BUILTIN_PD256_PD,
26224 IX86_BUILTIN_SI_SI256,
26225 IX86_BUILTIN_PS_PS256,
26226 IX86_BUILTIN_PD_PD256,
26228 IX86_BUILTIN_VTESTZPD,
26229 IX86_BUILTIN_VTESTCPD,
26230 IX86_BUILTIN_VTESTNZCPD,
26231 IX86_BUILTIN_VTESTZPS,
26232 IX86_BUILTIN_VTESTCPS,
26233 IX86_BUILTIN_VTESTNZCPS,
26234 IX86_BUILTIN_VTESTZPD256,
26235 IX86_BUILTIN_VTESTCPD256,
26236 IX86_BUILTIN_VTESTNZCPD256,
26237 IX86_BUILTIN_VTESTZPS256,
26238 IX86_BUILTIN_VTESTCPS256,
26239 IX86_BUILTIN_VTESTNZCPS256,
26240 IX86_BUILTIN_PTESTZ256,
26241 IX86_BUILTIN_PTESTC256,
26242 IX86_BUILTIN_PTESTNZC256,
26244 IX86_BUILTIN_MOVMSKPD256,
26245 IX86_BUILTIN_MOVMSKPS256,
26247 /* AVX2 */
26248 IX86_BUILTIN_MPSADBW256,
26249 IX86_BUILTIN_PABSB256,
26250 IX86_BUILTIN_PABSW256,
26251 IX86_BUILTIN_PABSD256,
26252 IX86_BUILTIN_PACKSSDW256,
26253 IX86_BUILTIN_PACKSSWB256,
26254 IX86_BUILTIN_PACKUSDW256,
26255 IX86_BUILTIN_PACKUSWB256,
26256 IX86_BUILTIN_PADDB256,
26257 IX86_BUILTIN_PADDW256,
26258 IX86_BUILTIN_PADDD256,
26259 IX86_BUILTIN_PADDQ256,
26260 IX86_BUILTIN_PADDSB256,
26261 IX86_BUILTIN_PADDSW256,
26262 IX86_BUILTIN_PADDUSB256,
26263 IX86_BUILTIN_PADDUSW256,
26264 IX86_BUILTIN_PALIGNR256,
26265 IX86_BUILTIN_AND256I,
26266 IX86_BUILTIN_ANDNOT256I,
26267 IX86_BUILTIN_PAVGB256,
26268 IX86_BUILTIN_PAVGW256,
26269 IX86_BUILTIN_PBLENDVB256,
26270 IX86_BUILTIN_PBLENDVW256,
26271 IX86_BUILTIN_PCMPEQB256,
26272 IX86_BUILTIN_PCMPEQW256,
26273 IX86_BUILTIN_PCMPEQD256,
26274 IX86_BUILTIN_PCMPEQQ256,
26275 IX86_BUILTIN_PCMPGTB256,
26276 IX86_BUILTIN_PCMPGTW256,
26277 IX86_BUILTIN_PCMPGTD256,
26278 IX86_BUILTIN_PCMPGTQ256,
26279 IX86_BUILTIN_PHADDW256,
26280 IX86_BUILTIN_PHADDD256,
26281 IX86_BUILTIN_PHADDSW256,
26282 IX86_BUILTIN_PHSUBW256,
26283 IX86_BUILTIN_PHSUBD256,
26284 IX86_BUILTIN_PHSUBSW256,
26285 IX86_BUILTIN_PMADDUBSW256,
26286 IX86_BUILTIN_PMADDWD256,
26287 IX86_BUILTIN_PMAXSB256,
26288 IX86_BUILTIN_PMAXSW256,
26289 IX86_BUILTIN_PMAXSD256,
26290 IX86_BUILTIN_PMAXUB256,
26291 IX86_BUILTIN_PMAXUW256,
26292 IX86_BUILTIN_PMAXUD256,
26293 IX86_BUILTIN_PMINSB256,
26294 IX86_BUILTIN_PMINSW256,
26295 IX86_BUILTIN_PMINSD256,
26296 IX86_BUILTIN_PMINUB256,
26297 IX86_BUILTIN_PMINUW256,
26298 IX86_BUILTIN_PMINUD256,
26299 IX86_BUILTIN_PMOVMSKB256,
26300 IX86_BUILTIN_PMOVSXBW256,
26301 IX86_BUILTIN_PMOVSXBD256,
26302 IX86_BUILTIN_PMOVSXBQ256,
26303 IX86_BUILTIN_PMOVSXWD256,
26304 IX86_BUILTIN_PMOVSXWQ256,
26305 IX86_BUILTIN_PMOVSXDQ256,
26306 IX86_BUILTIN_PMOVZXBW256,
26307 IX86_BUILTIN_PMOVZXBD256,
26308 IX86_BUILTIN_PMOVZXBQ256,
26309 IX86_BUILTIN_PMOVZXWD256,
26310 IX86_BUILTIN_PMOVZXWQ256,
26311 IX86_BUILTIN_PMOVZXDQ256,
26312 IX86_BUILTIN_PMULDQ256,
26313 IX86_BUILTIN_PMULHRSW256,
26314 IX86_BUILTIN_PMULHUW256,
26315 IX86_BUILTIN_PMULHW256,
26316 IX86_BUILTIN_PMULLW256,
26317 IX86_BUILTIN_PMULLD256,
26318 IX86_BUILTIN_PMULUDQ256,
26319 IX86_BUILTIN_POR256,
26320 IX86_BUILTIN_PSADBW256,
26321 IX86_BUILTIN_PSHUFB256,
26322 IX86_BUILTIN_PSHUFD256,
26323 IX86_BUILTIN_PSHUFHW256,
26324 IX86_BUILTIN_PSHUFLW256,
26325 IX86_BUILTIN_PSIGNB256,
26326 IX86_BUILTIN_PSIGNW256,
26327 IX86_BUILTIN_PSIGND256,
26328 IX86_BUILTIN_PSLLDQI256,
26329 IX86_BUILTIN_PSLLWI256,
26330 IX86_BUILTIN_PSLLW256,
26331 IX86_BUILTIN_PSLLDI256,
26332 IX86_BUILTIN_PSLLD256,
26333 IX86_BUILTIN_PSLLQI256,
26334 IX86_BUILTIN_PSLLQ256,
26335 IX86_BUILTIN_PSRAWI256,
26336 IX86_BUILTIN_PSRAW256,
26337 IX86_BUILTIN_PSRADI256,
26338 IX86_BUILTIN_PSRAD256,
26339 IX86_BUILTIN_PSRLDQI256,
26340 IX86_BUILTIN_PSRLWI256,
26341 IX86_BUILTIN_PSRLW256,
26342 IX86_BUILTIN_PSRLDI256,
26343 IX86_BUILTIN_PSRLD256,
26344 IX86_BUILTIN_PSRLQI256,
26345 IX86_BUILTIN_PSRLQ256,
26346 IX86_BUILTIN_PSUBB256,
26347 IX86_BUILTIN_PSUBW256,
26348 IX86_BUILTIN_PSUBD256,
26349 IX86_BUILTIN_PSUBQ256,
26350 IX86_BUILTIN_PSUBSB256,
26351 IX86_BUILTIN_PSUBSW256,
26352 IX86_BUILTIN_PSUBUSB256,
26353 IX86_BUILTIN_PSUBUSW256,
26354 IX86_BUILTIN_PUNPCKHBW256,
26355 IX86_BUILTIN_PUNPCKHWD256,
26356 IX86_BUILTIN_PUNPCKHDQ256,
26357 IX86_BUILTIN_PUNPCKHQDQ256,
26358 IX86_BUILTIN_PUNPCKLBW256,
26359 IX86_BUILTIN_PUNPCKLWD256,
26360 IX86_BUILTIN_PUNPCKLDQ256,
26361 IX86_BUILTIN_PUNPCKLQDQ256,
26362 IX86_BUILTIN_PXOR256,
26363 IX86_BUILTIN_MOVNTDQA256,
26364 IX86_BUILTIN_VBROADCASTSS_PS,
26365 IX86_BUILTIN_VBROADCASTSS_PS256,
26366 IX86_BUILTIN_VBROADCASTSD_PD256,
26367 IX86_BUILTIN_VBROADCASTSI256,
26368 IX86_BUILTIN_PBLENDD256,
26369 IX86_BUILTIN_PBLENDD128,
26370 IX86_BUILTIN_PBROADCASTB256,
26371 IX86_BUILTIN_PBROADCASTW256,
26372 IX86_BUILTIN_PBROADCASTD256,
26373 IX86_BUILTIN_PBROADCASTQ256,
26374 IX86_BUILTIN_PBROADCASTB128,
26375 IX86_BUILTIN_PBROADCASTW128,
26376 IX86_BUILTIN_PBROADCASTD128,
26377 IX86_BUILTIN_PBROADCASTQ128,
26378 IX86_BUILTIN_VPERMVARSI256,
26379 IX86_BUILTIN_VPERMDF256,
26380 IX86_BUILTIN_VPERMVARSF256,
26381 IX86_BUILTIN_VPERMDI256,
26382 IX86_BUILTIN_VPERMTI256,
26383 IX86_BUILTIN_VEXTRACT128I256,
26384 IX86_BUILTIN_VINSERT128I256,
26385 IX86_BUILTIN_MASKLOADD,
26386 IX86_BUILTIN_MASKLOADQ,
26387 IX86_BUILTIN_MASKLOADD256,
26388 IX86_BUILTIN_MASKLOADQ256,
26389 IX86_BUILTIN_MASKSTORED,
26390 IX86_BUILTIN_MASKSTOREQ,
26391 IX86_BUILTIN_MASKSTORED256,
26392 IX86_BUILTIN_MASKSTOREQ256,
26393 IX86_BUILTIN_PSLLVV4DI,
26394 IX86_BUILTIN_PSLLVV2DI,
26395 IX86_BUILTIN_PSLLVV8SI,
26396 IX86_BUILTIN_PSLLVV4SI,
26397 IX86_BUILTIN_PSRAVV8SI,
26398 IX86_BUILTIN_PSRAVV4SI,
26399 IX86_BUILTIN_PSRLVV4DI,
26400 IX86_BUILTIN_PSRLVV2DI,
26401 IX86_BUILTIN_PSRLVV8SI,
26402 IX86_BUILTIN_PSRLVV4SI,
26404 IX86_BUILTIN_GATHERSIV2DF,
26405 IX86_BUILTIN_GATHERSIV4DF,
26406 IX86_BUILTIN_GATHERDIV2DF,
26407 IX86_BUILTIN_GATHERDIV4DF,
26408 IX86_BUILTIN_GATHERSIV4SF,
26409 IX86_BUILTIN_GATHERSIV8SF,
26410 IX86_BUILTIN_GATHERDIV4SF,
26411 IX86_BUILTIN_GATHERDIV8SF,
26412 IX86_BUILTIN_GATHERSIV2DI,
26413 IX86_BUILTIN_GATHERSIV4DI,
26414 IX86_BUILTIN_GATHERDIV2DI,
26415 IX86_BUILTIN_GATHERDIV4DI,
26416 IX86_BUILTIN_GATHERSIV4SI,
26417 IX86_BUILTIN_GATHERSIV8SI,
26418 IX86_BUILTIN_GATHERDIV4SI,
26419 IX86_BUILTIN_GATHERDIV8SI,
26421 /* Alternate 4-element gathers for the vectorizer, where
26422 all operands are 32 bytes wide. */
26423 IX86_BUILTIN_GATHERALTSIV4DF,
26424 IX86_BUILTIN_GATHERALTDIV8SF,
26425 IX86_BUILTIN_GATHERALTSIV4DI,
26426 IX86_BUILTIN_GATHERALTDIV8SI,
26428 /* TFmode support builtins. */
26429 IX86_BUILTIN_INFQ,
26430 IX86_BUILTIN_HUGE_VALQ,
26431 IX86_BUILTIN_FABSQ,
26432 IX86_BUILTIN_COPYSIGNQ,
26434 /* Vectorizer support builtins. */
26435 IX86_BUILTIN_CPYSGNPS,
26436 IX86_BUILTIN_CPYSGNPD,
26437 IX86_BUILTIN_CPYSGNPS256,
26438 IX86_BUILTIN_CPYSGNPD256,
26440 /* FMA4 instructions. */
26441 IX86_BUILTIN_VFMADDSS,
26442 IX86_BUILTIN_VFMADDSD,
26443 IX86_BUILTIN_VFMADDPS,
26444 IX86_BUILTIN_VFMADDPD,
26445 IX86_BUILTIN_VFMADDPS256,
26446 IX86_BUILTIN_VFMADDPD256,
26447 IX86_BUILTIN_VFMADDSUBPS,
26448 IX86_BUILTIN_VFMADDSUBPD,
26449 IX86_BUILTIN_VFMADDSUBPS256,
26450 IX86_BUILTIN_VFMADDSUBPD256,
26452 /* FMA3 instructions. */
26453 IX86_BUILTIN_VFMADDSS3,
26454 IX86_BUILTIN_VFMADDSD3,
26456 /* XOP instructions. */
26457 IX86_BUILTIN_VPCMOV,
26458 IX86_BUILTIN_VPCMOV_V2DI,
26459 IX86_BUILTIN_VPCMOV_V4SI,
26460 IX86_BUILTIN_VPCMOV_V8HI,
26461 IX86_BUILTIN_VPCMOV_V16QI,
26462 IX86_BUILTIN_VPCMOV_V4SF,
26463 IX86_BUILTIN_VPCMOV_V2DF,
26464 IX86_BUILTIN_VPCMOV256,
26465 IX86_BUILTIN_VPCMOV_V4DI256,
26466 IX86_BUILTIN_VPCMOV_V8SI256,
26467 IX86_BUILTIN_VPCMOV_V16HI256,
26468 IX86_BUILTIN_VPCMOV_V32QI256,
26469 IX86_BUILTIN_VPCMOV_V8SF256,
26470 IX86_BUILTIN_VPCMOV_V4DF256,
26472 IX86_BUILTIN_VPPERM,
26474 IX86_BUILTIN_VPMACSSWW,
26475 IX86_BUILTIN_VPMACSWW,
26476 IX86_BUILTIN_VPMACSSWD,
26477 IX86_BUILTIN_VPMACSWD,
26478 IX86_BUILTIN_VPMACSSDD,
26479 IX86_BUILTIN_VPMACSDD,
26480 IX86_BUILTIN_VPMACSSDQL,
26481 IX86_BUILTIN_VPMACSSDQH,
26482 IX86_BUILTIN_VPMACSDQL,
26483 IX86_BUILTIN_VPMACSDQH,
26484 IX86_BUILTIN_VPMADCSSWD,
26485 IX86_BUILTIN_VPMADCSWD,
26487 IX86_BUILTIN_VPHADDBW,
26488 IX86_BUILTIN_VPHADDBD,
26489 IX86_BUILTIN_VPHADDBQ,
26490 IX86_BUILTIN_VPHADDWD,
26491 IX86_BUILTIN_VPHADDWQ,
26492 IX86_BUILTIN_VPHADDDQ,
26493 IX86_BUILTIN_VPHADDUBW,
26494 IX86_BUILTIN_VPHADDUBD,
26495 IX86_BUILTIN_VPHADDUBQ,
26496 IX86_BUILTIN_VPHADDUWD,
26497 IX86_BUILTIN_VPHADDUWQ,
26498 IX86_BUILTIN_VPHADDUDQ,
26499 IX86_BUILTIN_VPHSUBBW,
26500 IX86_BUILTIN_VPHSUBWD,
26501 IX86_BUILTIN_VPHSUBDQ,
26503 IX86_BUILTIN_VPROTB,
26504 IX86_BUILTIN_VPROTW,
26505 IX86_BUILTIN_VPROTD,
26506 IX86_BUILTIN_VPROTQ,
26507 IX86_BUILTIN_VPROTB_IMM,
26508 IX86_BUILTIN_VPROTW_IMM,
26509 IX86_BUILTIN_VPROTD_IMM,
26510 IX86_BUILTIN_VPROTQ_IMM,
26512 IX86_BUILTIN_VPSHLB,
26513 IX86_BUILTIN_VPSHLW,
26514 IX86_BUILTIN_VPSHLD,
26515 IX86_BUILTIN_VPSHLQ,
26516 IX86_BUILTIN_VPSHAB,
26517 IX86_BUILTIN_VPSHAW,
26518 IX86_BUILTIN_VPSHAD,
26519 IX86_BUILTIN_VPSHAQ,
26521 IX86_BUILTIN_VFRCZSS,
26522 IX86_BUILTIN_VFRCZSD,
26523 IX86_BUILTIN_VFRCZPS,
26524 IX86_BUILTIN_VFRCZPD,
26525 IX86_BUILTIN_VFRCZPS256,
26526 IX86_BUILTIN_VFRCZPD256,
26528 IX86_BUILTIN_VPCOMEQUB,
26529 IX86_BUILTIN_VPCOMNEUB,
26530 IX86_BUILTIN_VPCOMLTUB,
26531 IX86_BUILTIN_VPCOMLEUB,
26532 IX86_BUILTIN_VPCOMGTUB,
26533 IX86_BUILTIN_VPCOMGEUB,
26534 IX86_BUILTIN_VPCOMFALSEUB,
26535 IX86_BUILTIN_VPCOMTRUEUB,
26537 IX86_BUILTIN_VPCOMEQUW,
26538 IX86_BUILTIN_VPCOMNEUW,
26539 IX86_BUILTIN_VPCOMLTUW,
26540 IX86_BUILTIN_VPCOMLEUW,
26541 IX86_BUILTIN_VPCOMGTUW,
26542 IX86_BUILTIN_VPCOMGEUW,
26543 IX86_BUILTIN_VPCOMFALSEUW,
26544 IX86_BUILTIN_VPCOMTRUEUW,
26546 IX86_BUILTIN_VPCOMEQUD,
26547 IX86_BUILTIN_VPCOMNEUD,
26548 IX86_BUILTIN_VPCOMLTUD,
26549 IX86_BUILTIN_VPCOMLEUD,
26550 IX86_BUILTIN_VPCOMGTUD,
26551 IX86_BUILTIN_VPCOMGEUD,
26552 IX86_BUILTIN_VPCOMFALSEUD,
26553 IX86_BUILTIN_VPCOMTRUEUD,
26555 IX86_BUILTIN_VPCOMEQUQ,
26556 IX86_BUILTIN_VPCOMNEUQ,
26557 IX86_BUILTIN_VPCOMLTUQ,
26558 IX86_BUILTIN_VPCOMLEUQ,
26559 IX86_BUILTIN_VPCOMGTUQ,
26560 IX86_BUILTIN_VPCOMGEUQ,
26561 IX86_BUILTIN_VPCOMFALSEUQ,
26562 IX86_BUILTIN_VPCOMTRUEUQ,
26564 IX86_BUILTIN_VPCOMEQB,
26565 IX86_BUILTIN_VPCOMNEB,
26566 IX86_BUILTIN_VPCOMLTB,
26567 IX86_BUILTIN_VPCOMLEB,
26568 IX86_BUILTIN_VPCOMGTB,
26569 IX86_BUILTIN_VPCOMGEB,
26570 IX86_BUILTIN_VPCOMFALSEB,
26571 IX86_BUILTIN_VPCOMTRUEB,
26573 IX86_BUILTIN_VPCOMEQW,
26574 IX86_BUILTIN_VPCOMNEW,
26575 IX86_BUILTIN_VPCOMLTW,
26576 IX86_BUILTIN_VPCOMLEW,
26577 IX86_BUILTIN_VPCOMGTW,
26578 IX86_BUILTIN_VPCOMGEW,
26579 IX86_BUILTIN_VPCOMFALSEW,
26580 IX86_BUILTIN_VPCOMTRUEW,
26582 IX86_BUILTIN_VPCOMEQD,
26583 IX86_BUILTIN_VPCOMNED,
26584 IX86_BUILTIN_VPCOMLTD,
26585 IX86_BUILTIN_VPCOMLED,
26586 IX86_BUILTIN_VPCOMGTD,
26587 IX86_BUILTIN_VPCOMGED,
26588 IX86_BUILTIN_VPCOMFALSED,
26589 IX86_BUILTIN_VPCOMTRUED,
26591 IX86_BUILTIN_VPCOMEQQ,
26592 IX86_BUILTIN_VPCOMNEQ,
26593 IX86_BUILTIN_VPCOMLTQ,
26594 IX86_BUILTIN_VPCOMLEQ,
26595 IX86_BUILTIN_VPCOMGTQ,
26596 IX86_BUILTIN_VPCOMGEQ,
26597 IX86_BUILTIN_VPCOMFALSEQ,
26598 IX86_BUILTIN_VPCOMTRUEQ,
26600 /* LWP instructions. */
26601 IX86_BUILTIN_LLWPCB,
26602 IX86_BUILTIN_SLWPCB,
26603 IX86_BUILTIN_LWPVAL32,
26604 IX86_BUILTIN_LWPVAL64,
26605 IX86_BUILTIN_LWPINS32,
26606 IX86_BUILTIN_LWPINS64,
26608 IX86_BUILTIN_CLZS,
26610 /* RTM */
26611 IX86_BUILTIN_XBEGIN,
26612 IX86_BUILTIN_XEND,
26613 IX86_BUILTIN_XABORT,
26614 IX86_BUILTIN_XTEST,
26616 /* BMI instructions. */
26617 IX86_BUILTIN_BEXTR32,
26618 IX86_BUILTIN_BEXTR64,
26619 IX86_BUILTIN_CTZS,
26621 /* TBM instructions. */
26622 IX86_BUILTIN_BEXTRI32,
26623 IX86_BUILTIN_BEXTRI64,
26625 /* BMI2 instructions. */
26626 IX86_BUILTIN_BZHI32,
26627 IX86_BUILTIN_BZHI64,
26628 IX86_BUILTIN_PDEP32,
26629 IX86_BUILTIN_PDEP64,
26630 IX86_BUILTIN_PEXT32,
26631 IX86_BUILTIN_PEXT64,
26633 /* ADX instructions. */
26634 IX86_BUILTIN_ADDCARRYX32,
26635 IX86_BUILTIN_ADDCARRYX64,
26637 /* FSGSBASE instructions. */
26638 IX86_BUILTIN_RDFSBASE32,
26639 IX86_BUILTIN_RDFSBASE64,
26640 IX86_BUILTIN_RDGSBASE32,
26641 IX86_BUILTIN_RDGSBASE64,
26642 IX86_BUILTIN_WRFSBASE32,
26643 IX86_BUILTIN_WRFSBASE64,
26644 IX86_BUILTIN_WRGSBASE32,
26645 IX86_BUILTIN_WRGSBASE64,
26647 /* RDRND instructions. */
26648 IX86_BUILTIN_RDRAND16_STEP,
26649 IX86_BUILTIN_RDRAND32_STEP,
26650 IX86_BUILTIN_RDRAND64_STEP,
26652 /* RDSEED instructions. */
26653 IX86_BUILTIN_RDSEED16_STEP,
26654 IX86_BUILTIN_RDSEED32_STEP,
26655 IX86_BUILTIN_RDSEED64_STEP,
26657 /* F16C instructions. */
26658 IX86_BUILTIN_CVTPH2PS,
26659 IX86_BUILTIN_CVTPH2PS256,
26660 IX86_BUILTIN_CVTPS2PH,
26661 IX86_BUILTIN_CVTPS2PH256,
26663 /* CFString built-in for darwin */
26664 IX86_BUILTIN_CFSTRING,
26666 /* Builtins to get CPU type and supported features. */
26667 IX86_BUILTIN_CPU_INIT,
26668 IX86_BUILTIN_CPU_IS,
26669 IX86_BUILTIN_CPU_SUPPORTS,
26671 IX86_BUILTIN_MAX
26674 /* Table for the ix86 builtin decls. */
26675 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26677 /* Table of all of the builtin functions that are possible with different ISAs
26678 but are waiting to be built until a function is declared to use that
26679 ISA. */
26680 struct builtin_isa {
26681 const char *name; /* function name */
26682 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26683 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26684 bool const_p; /* true if the declaration is constant */
26685 bool set_and_not_built_p;
26688 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26691 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26692 of which isa_flags to use in the ix86_builtins_isa array. Stores the
26693 function decl in the ix86_builtins array. Returns the function decl or
26694 NULL_TREE if the builtin was not added.
26696 If the front end has a special hook for builtin functions, delay adding
26697 builtin functions that aren't in the current ISA until the ISA is changed
26698 with function specific optimization. Doing so can save about 300K for the
26699 default compiler. When the builtin is expanded, check at that time whether
26700 it is valid.
26702 If the front end doesn't have a special hook, record all builtins, even
26703 those whose instruction set isn't in the current ISA, in case the user uses
26704 function specific options for a different ISA, so that we don't get scope
26705 errors if a builtin is added in the middle of a function scope. */
26707 static inline tree
26708 def_builtin (HOST_WIDE_INT mask, const char *name,
26709 enum ix86_builtin_func_type tcode,
26710 enum ix86_builtins code)
26712 tree decl = NULL_TREE;
26714 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26716 ix86_builtins_isa[(int) code].isa = mask;
26718 mask &= ~OPTION_MASK_ISA_64BIT;
26719 if (mask == 0
26720 || (mask & ix86_isa_flags) != 0
26721 || (lang_hooks.builtin_function
26722 == lang_hooks.builtin_function_ext_scope))
26725 tree type = ix86_get_builtin_func_type (tcode);
26726 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26727 NULL, NULL_TREE);
26728 ix86_builtins[(int) code] = decl;
26729 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26731 else
26733 ix86_builtins[(int) code] = NULL_TREE;
26734 ix86_builtins_isa[(int) code].tcode = tcode;
26735 ix86_builtins_isa[(int) code].name = name;
26736 ix86_builtins_isa[(int) code].const_p = false;
26737 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26741 return decl;
26744 /* Like def_builtin, but also marks the function decl "const". */
26746 static inline tree
26747 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26748 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26750 tree decl = def_builtin (mask, name, tcode, code);
26751 if (decl)
26752 TREE_READONLY (decl) = 1;
26753 else
26754 ix86_builtins_isa[(int) code].const_p = true;
26756 return decl;
26759 /* Add any new builtin functions for a given ISA that may not have been
26760 declared yet. This saves a bit of space compared to eagerly adding all
26761 of the declarations to the tree, whether or not they are used. */
26763 static void
26764 ix86_add_new_builtins (HOST_WIDE_INT isa)
26766 int i;
26768 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26770 if ((ix86_builtins_isa[i].isa & isa) != 0
26771 && ix86_builtins_isa[i].set_and_not_built_p)
26773 tree decl, type;
26775 /* Don't define the builtin again. */
26776 ix86_builtins_isa[i].set_and_not_built_p = false;
26778 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26779 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26780 type, i, BUILT_IN_MD, NULL,
26781 NULL_TREE);
26783 ix86_builtins[i] = decl;
26784 if (ix86_builtins_isa[i].const_p)
26785 TREE_READONLY (decl) = 1;
26790 /* Bits for builtin_description.flag. */
26792 /* Set when we don't support the comparison natively, and should
26793 swap_comparison in order to support it. */
26794 #define BUILTIN_DESC_SWAP_OPERANDS 1
26796 struct builtin_description
26798 const HOST_WIDE_INT mask;
26799 const enum insn_code icode;
26800 const char *const name;
26801 const enum ix86_builtins code;
26802 const enum rtx_code comparison;
26803 const int flag;
26806 static const struct builtin_description bdesc_comi[] =
26808 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26809 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26810 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26811 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26812 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26813 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26814 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26815 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26816 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26817 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26818 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26819 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26820 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26821 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26822 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26823 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26824 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26825 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26826 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26827 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26828 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26829 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26830 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26831 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26832 };
26834 static const struct builtin_description bdesc_pcmpestr[] =
26835 {
26836 /* SSE4.2 */
26837 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26838 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26839 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26840 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26841 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26842 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26843 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26844 };
26846 static const struct builtin_description bdesc_pcmpistr[] =
26847 {
26848 /* SSE4.2 */
26849 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26850 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26851 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26852 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26853 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26854 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26855 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26856 };
26858 /* Special builtins with variable number of arguments. */
26859 static const struct builtin_description bdesc_special_args[] =
26860 {
26861 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26862 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26863 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26865 /* MMX */
26866 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26868 /* 3DNow! */
26869 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26871 /* FXSR, XSAVE and XSAVEOPT */
26872 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26873 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26874 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26875 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26876 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26878 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26879 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26880 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26881 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26882 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26884 /* SSE */
26885 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26886 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26887 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26889 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26892 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26894 /* SSE or 3DNow!A */
26895 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26896 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26898 /* SSE2 */
26899 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26900 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26901 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26902 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26903 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26904 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26905 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26906 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26907 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26913 /* SSE3 */
26914 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26916 /* SSE4.1 */
26917 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26919 /* SSE4A */
26920 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26921 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26923 /* AVX */
26924 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26925 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26927 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26928 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26929 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26930 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26931 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26933 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26934 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26935 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26936 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26937 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26938 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26939 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26941 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26942 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26943 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26945 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26946 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26947 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26948 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26949 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26950 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26951 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26952 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26954 /* AVX2 */
26955 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26956 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26957 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26958 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26959 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26961 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26965 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26966 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26967 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26968 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26969 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26970 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26972 /* FSGSBASE */
26973 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26974 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26975 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26976 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26977 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26978 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26979 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26980 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26982 /* RTM */
26983 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26984 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26985 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26986 };
26988 /* Builtins with variable number of arguments. */
26989 static const struct builtin_description bdesc_args[] =
26990 {
26991 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26992 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26993 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26994 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26995 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26996 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26997 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26999 /* MMX */
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27020 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27024 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27026 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27027 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27028 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27029 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27031 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27032 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27033 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27034 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27035 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27036 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27038 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27039 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27040 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27042 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27044 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27045 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27046 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27047 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27048 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27049 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27051 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27052 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27053 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27054 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27055 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27056 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27058 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27059 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27060 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27061 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27063 /* 3DNow! */
27064 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27065 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27066 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27067 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27069 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27070 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27071 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27072 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27073 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27074 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27075 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27076 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27077 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27078 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27079 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27080 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27081 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27082 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27083 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27085 /* 3DNow!A */
27086 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27087 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27088 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27089 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27090 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27091 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27093 /* SSE */
27094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27102 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27105 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27126 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27130 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27131 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27132 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27133 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27135 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27136 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27137 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27138 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27139 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27141 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27142 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27143 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27144 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27146 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27147 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27148 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27149 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27151 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27153 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27154 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27155 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27156 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27157 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27159 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27160 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27161 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
27163 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27165 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27166 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27167 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27169 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27170 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27172 /* SSE MMX or 3DNow!A */
27173 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27174 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27175 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27177 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27178 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27179 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27180 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27182 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27183 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27185 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27187 /* SSE2 */
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27194 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27206 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27207 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27214 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27250 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27255 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27268 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI },
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27291 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27306 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27308 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27323 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27324 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27329 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27330 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27331 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27332 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27333 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27334 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27338 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27339 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27340 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27341 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27342 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27344 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27345 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27346 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27347 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
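/* Annotation on the shift builtins above: the _SI_COUNT prototypes take the
   shift count as a scalar integer (the ...wi/...di/...qi forms), while the
   _V*_COUNT prototypes take the count from a vector register; the
   _INT_CONVERT variants (pslldqi128/psrldqi128) shift the entire 128-bit
   value, so their operands are handled in TImode by the v1ti patterns.  */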
27349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27350 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27355 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27357 /* SSE2 MMX */
27358 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27359 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27361 /* SSE3 */
27362 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27363 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27365 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27366 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27367 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27368 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27369 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27370 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
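/* Annotation: the SSE3 arithmetic builtins above are normally reached through
   the wrappers in pmmintrin.h rather than called directly.  A minimal
   user-code sketch (not part of this file; compile with -msse3):

       #include <pmmintrin.h>

       __m128 sum_adjacent (__m128 a, __m128 b)
       {
         return _mm_hadd_ps (a, b);    // wraps __builtin_ia32_haddps
       }
*/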
27372 /* SSSE3 */
27373 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27374 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27375 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27376 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27377 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27378 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27380 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27381 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27382 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27383 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27384 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27385 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27386 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27387 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27388 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27389 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27390 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27391 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27392 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27393 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27394 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27395 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27396 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27397 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27398 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27399 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27400 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27401 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27402 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27403 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27405 /* SSSE3. */
27406 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27407 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27409 /* SSE4.1 */
27410 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27411 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27412 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27413 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27414 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27415 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27416 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27417 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27418 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27419 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27421 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27422 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27423 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27424 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27425 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27426 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27427 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27428 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27429 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27430 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27432 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27433 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27435 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27436 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27437 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27438 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27439 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27440 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27441 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27442 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27443 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27444 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27445 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27446 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27448 /* SSE4.1 */
27449 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27450 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27451 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27452 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
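/* Annotation: in the entries below, the rtx-comparison slot is reused to
   carry the ROUND_* constant (ROUND_FLOOR, ROUND_CEIL, ROUND_TRUNC,
   ROUND_MXCSR) that selects the rounding-mode immediate for the shared
   roundpd/roundps patterns; the cast through enum rtx_code only exists to
   fit the field.  */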
27454 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27455 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27456 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27457 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27459 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27460 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27462 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27463 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27465 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27466 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27467 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27468 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27470 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27471 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27473 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27474 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27476 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27477 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27478 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
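/* Annotation: for the ptest builtins above, the comparison code selects which
   condition is tested after the PTEST: EQ for ptestz (ZF set), LTU for
   ptestc (CF set), and GTU for ptestnzc (neither flag set).  */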
27480 /* SSE4.2 */
27481 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27482 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27483 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27484 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27485 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27487 /* SSE4A */
27488 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27489 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27490 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27491 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27493 /* AES */
27494 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27495 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27497 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27498 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27499 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27500 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27502 /* PCLMUL */
27503 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27505 /* AVX */
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27581 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27593 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27596 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27597 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27600 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27601 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27607 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27608 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27619 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27623 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27624 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27639 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27641 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27642 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27644 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
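/* Annotation: as with the 128-bit builtins, the AVX entries above back the
   intrinsic wrappers in the public headers.  A minimal user-code sketch
   (not part of this file; compile with -mavx):

       #include <immintrin.h>

       __m256d add4 (__m256d a, __m256d b)
       {
         return _mm256_add_pd (a, b);    // wraps __builtin_ia32_addpd256
       }
*/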
27646 /* AVX2 */
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
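/* Annotation: the 256-bit byte shifts above (pslldqi256/psrldqi256, the v2ti
   patterns) operate independently on each 128-bit lane, matching the AVX2
   vpslldq/vpsrldq instructions, rather than shifting the full 256-bit
   value.  */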
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27756 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27757 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27766 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27794 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27796 /* BMI */
27797 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27798 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27799 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27801 /* TBM */
27802 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27803 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27805 /* F16C */
27806 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27807 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27808 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27809 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27811 /* BMI2 */
27812 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27813 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27814 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27815 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27816 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27817 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
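/* Illustrative sketch only: the table entries above merely bind builtin
   names to insn patterns; from user code (compiled with -mbmi -mbmi2) the
   BMI/BMI2 builtins are callable directly with the scalar signatures their
   *_FTYPE_* flags describe, roughly as follows.  */
unsigned int
sketch_bmi_bit_tricks (unsigned int src, unsigned int mask)
{
  /* UINT_FTYPE_UINT_UINT: length in bits 15:8, start bit in bits 7:0.  */
  unsigned int field  = __builtin_ia32_bextr_u32 (src, (8 << 8) | 4);
  /* Scatter the low-order bits of SRC into the positions set in MASK.  */
  unsigned int spread = __builtin_ia32_pdep_si (src, mask);
  return field ^ spread;
}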
27820 /* FMA4 and XOP. */
27821 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27822 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27823 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27824 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27825 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27826 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27827 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27828 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27829 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27830 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27831 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27832 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27833 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27834 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27835 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27836 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27837 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27838 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27839 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27840 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27841 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27842 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27843 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27844 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27845 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27846 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27847 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27848 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27849 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27850 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27851 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27852 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27853 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27854 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27855 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27856 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27857 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27858 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27859 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27860 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27861 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27862 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27863 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27864 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27865 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27866 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27867 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27868 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27869 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27870 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27871 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27872 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
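/* Illustrative sketch only: each MULTI_ARG_* macro above is an alias for an
   ix86_builtin_func_type.  For example, a bdesc_multi_arg entry below that
   uses MULTI_ARG_3_SI (V4SI_FTYPE_V4SI_V4SI_V4SI) is __builtin_ia32_vpmacsdd,
   which with -mxop could be used as:  */
typedef int sketch_v4si __attribute__ ((__vector_size__ (16)));

sketch_v4si
sketch_xop_mul_add (sketch_v4si a, sketch_v4si b, sketch_v4si c)
{
  return __builtin_ia32_vpmacsdd (a, b, c);  /* a * b + c in each 32-bit lane */
}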
27874 static const struct builtin_description bdesc_multi_arg[] =
27876 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27877 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27878 UNKNOWN, (int)MULTI_ARG_3_SF },
27879 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27880 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27881 UNKNOWN, (int)MULTI_ARG_3_DF },
27883 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27884 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27885 UNKNOWN, (int)MULTI_ARG_3_SF },
27886 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27887 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27888 UNKNOWN, (int)MULTI_ARG_3_DF },
27890 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27891 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27892 UNKNOWN, (int)MULTI_ARG_3_SF },
27893 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27894 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27895 UNKNOWN, (int)MULTI_ARG_3_DF },
27896 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27897 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27898 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27899 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27900 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27901 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27903 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27904 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27905 UNKNOWN, (int)MULTI_ARG_3_SF },
27906 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27907 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27908 UNKNOWN, (int)MULTI_ARG_3_DF },
27909 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27910 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27911 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27912 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27913 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27914 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28076 /* TM vector builtins. */
28078 /* Reuse the existing x86-specific `struct builtin_description' because
28079 we're lazy.  Add casts to make them fit. */
28080 static const struct builtin_description bdesc_tm[] =
28082 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28083 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28084 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28085 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28086 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28087 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28088 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28090 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28091 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28092 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28093 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28094 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28095 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28096 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28098 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28099 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28100 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28101 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28102 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28103 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28104 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28106 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28107 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28108 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28111 /* TM callbacks. */
28113 /* Return the builtin decl needed to load a vector of TYPE. */
28115 static tree
28116 ix86_builtin_tm_load (tree type)
28118 if (TREE_CODE (type) == VECTOR_TYPE)
28120 switch (tree_low_cst (TYPE_SIZE (type), 1))
28122 case 64:
28123 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28124 case 128:
28125 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28126 case 256:
28127 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28130 return NULL_TREE;
28133 /* Return the builtin decl needed to store a vector of TYPE. */
28135 static tree
28136 ix86_builtin_tm_store (tree type)
28138 if (TREE_CODE (type) == VECTOR_TYPE)
28140 switch (tree_low_cst (TYPE_SIZE (type), 1))
28142 case 64:
28143 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28144 case 128:
28145 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28146 case 256:
28147 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28150 return NULL_TREE;
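/* A minimal sketch (the helper below is hypothetical, not part of the
   original file) of how trans-mem lowering could use the two hooks above:
   pick the builtin decl for the vector's size and emit a call to it.  */
static gimple
sketch_build_tm_vector_store (tree type, tree addr, tree value)
{
  tree fndecl = ix86_builtin_tm_store (type);
  if (fndecl == NULL_TREE)
    return NULL;  /* Not a 64/128/256-bit vector; caller must fall back.  */
  return gimple_build_call (fndecl, 2, addr, value);
}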
28153 /* Initialize the transactional memory vector load/store builtins. */
28155 static void
28156 ix86_init_tm_builtins (void)
28158 enum ix86_builtin_func_type ftype;
28159 const struct builtin_description *d;
28160 size_t i;
28161 tree decl;
28162 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28163 tree attrs_log, attrs_type_log;
28165 if (!flag_tm)
28166 return;
28168 /* If there are no builtins defined, we must be compiling in a
28169 language without trans-mem support. */
28170 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28171 return;
28173 /* Use whatever attributes a normal TM load has. */
28174 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28175 attrs_load = DECL_ATTRIBUTES (decl);
28176 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28177 /* Use whatever attributes a normal TM store has. */
28178 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28179 attrs_store = DECL_ATTRIBUTES (decl);
28180 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28181 /* Use whatever attributes a normal TM log has. */
28182 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28183 attrs_log = DECL_ATTRIBUTES (decl);
28184 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28186 for (i = 0, d = bdesc_tm;
28187 i < ARRAY_SIZE (bdesc_tm);
28188 i++, d++)
28190 if ((d->mask & ix86_isa_flags) != 0
28191 || (lang_hooks.builtin_function
28192 == lang_hooks.builtin_function_ext_scope))
28194 tree type, attrs, attrs_type;
28195 enum built_in_function code = (enum built_in_function) d->code;
28197 ftype = (enum ix86_builtin_func_type) d->flag;
28198 type = ix86_get_builtin_func_type (ftype);
28200 if (BUILTIN_TM_LOAD_P (code))
28202 attrs = attrs_load;
28203 attrs_type = attrs_type_load;
28205 else if (BUILTIN_TM_STORE_P (code))
28207 attrs = attrs_store;
28208 attrs_type = attrs_type_store;
28210 else
28212 attrs = attrs_log;
28213 attrs_type = attrs_type_log;
28215 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28216 /* The builtin without the prefix for
28217 calling it directly. */
28218 d->name + strlen ("__builtin_"),
28219 attrs);
28220 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28221 set the TYPE_ATTRIBUTES. */
28222 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28224 set_builtin_decl (code, decl, false);
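/* Illustrative sketch only, assuming -msse -fgnu-tm and a front end with
   trans-mem support: the loop above then registers __builtin__ITM_WM128
   (library name "_ITM_WM128", i.e. d->name with "__builtin_" stripped)
   with type VOID_FTYPE_PV4SF_V4SF, so it could be called as:  */
typedef float sketch_v4sf_tm __attribute__ ((__vector_size__ (16)));

void
sketch_tm_vector_write (sketch_v4sf_tm *slot, sketch_v4sf_tm value)
{
  __builtin__ITM_WM128 (slot, value);  /* transactional 128-bit store */
}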
28229 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28230 not in the current target ISA, so that the user can compile particular
28231 modules with target-specific options that differ from the command line
28232 options. */
28233 static void
28234 ix86_init_mmx_sse_builtins (void)
28236 const struct builtin_description * d;
28237 enum ix86_builtin_func_type ftype;
28238 size_t i;
28240 /* Add all special builtins with variable number of operands. */
28241 for (i = 0, d = bdesc_special_args;
28242 i < ARRAY_SIZE (bdesc_special_args);
28243 i++, d++)
28245 if (d->name == 0)
28246 continue;
28248 ftype = (enum ix86_builtin_func_type) d->flag;
28249 def_builtin (d->mask, d->name, ftype, d->code);
28252 /* Add all builtins with variable number of operands. */
28253 for (i = 0, d = bdesc_args;
28254 i < ARRAY_SIZE (bdesc_args);
28255 i++, d++)
28257 if (d->name == 0)
28258 continue;
28260 ftype = (enum ix86_builtin_func_type) d->flag;
28261 def_builtin_const (d->mask, d->name, ftype, d->code);
28264 /* pcmpestr[im] insns. */
28265 for (i = 0, d = bdesc_pcmpestr;
28266 i < ARRAY_SIZE (bdesc_pcmpestr);
28267 i++, d++)
28269 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28270 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28271 else
28272 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28273 def_builtin_const (d->mask, d->name, ftype, d->code);
28276 /* pcmpistr[im] insns. */
28277 for (i = 0, d = bdesc_pcmpistr;
28278 i < ARRAY_SIZE (bdesc_pcmpistr);
28279 i++, d++)
28281 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28282 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28283 else
28284 ftype = INT_FTYPE_V16QI_V16QI_INT;
28285 def_builtin_const (d->mask, d->name, ftype, d->code);
28288 /* comi/ucomi insns. */
28289 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28291 if (d->mask == OPTION_MASK_ISA_SSE2)
28292 ftype = INT_FTYPE_V2DF_V2DF;
28293 else
28294 ftype = INT_FTYPE_V4SF_V4SF;
28295 def_builtin_const (d->mask, d->name, ftype, d->code);
28298 /* SSE */
28299 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28300 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28301 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28302 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28304 /* SSE or 3DNow!A */
28305 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28306 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28307 IX86_BUILTIN_MASKMOVQ);
28309 /* SSE2 */
28310 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28311 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28313 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28314 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28315 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28316 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28318 /* SSE3. */
28319 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28320 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28321 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28322 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28324 /* AES */
28325 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28326 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28327 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28328 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28329 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28330 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28331 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28332 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28333 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28334 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28335 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28336 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28338 /* PCLMUL */
28339 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28340 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28342 /* RDRND */
28343 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28344 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28345 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28346 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28347 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28348 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28349 IX86_BUILTIN_RDRAND64_STEP);
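/* Illustrative sketch only: INT_FTYPE_PUNSIGNED means the rdrand step
   builtins return the carry flag (non-zero on success) and store the random
   value through the pointer argument; requires -mrdrnd.  */
int
sketch_rdrand32 (unsigned int *out)
{
  return __builtin_ia32_rdrand32_step (out);
}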
28351 /* AVX2 */
28352 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28353 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28354 IX86_BUILTIN_GATHERSIV2DF);
28356 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28357 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28358 IX86_BUILTIN_GATHERSIV4DF);
28360 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28361 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28362 IX86_BUILTIN_GATHERDIV2DF);
28364 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28365 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28366 IX86_BUILTIN_GATHERDIV4DF);
28368 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28369 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28370 IX86_BUILTIN_GATHERSIV4SF);
28372 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28373 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28374 IX86_BUILTIN_GATHERSIV8SF);
28376 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28377 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28378 IX86_BUILTIN_GATHERDIV4SF);
28380 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28381 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28382 IX86_BUILTIN_GATHERDIV8SF);
28384 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28385 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28386 IX86_BUILTIN_GATHERSIV2DI);
28388 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28389 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28390 IX86_BUILTIN_GATHERSIV4DI);
28392 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28393 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28394 IX86_BUILTIN_GATHERDIV2DI);
28396 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28397 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28398 IX86_BUILTIN_GATHERDIV4DI);
28400 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28401 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28402 IX86_BUILTIN_GATHERSIV4SI);
28404 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28405 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28406 IX86_BUILTIN_GATHERSIV8SI);
28408 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28409 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28410 IX86_BUILTIN_GATHERDIV4SI);
28412 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28413 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28414 IX86_BUILTIN_GATHERDIV8SI);
28416 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28417 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28418 IX86_BUILTIN_GATHERALTSIV4DF);
28420 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28421 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28422 IX86_BUILTIN_GATHERALTDIV8SF);
28424 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28425 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28426 IX86_BUILTIN_GATHERALTSIV4DI);
28428 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28429 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28430 IX86_BUILTIN_GATHERALTDIV8SI);
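/* Illustrative sketch only: the gather FTYPEs above spell out the operand
   order (src, base, index, mask, scale), with scale a literal 1, 2, 4 or 8
   and a lane gathered when the sign bit of its mask element is set.  The
   vector typedefs here are local stand-ins for __v4sf/__v4si.  */
typedef float sketch_v4sf_g __attribute__ ((__vector_size__ (16)));
typedef int   sketch_v4si_g __attribute__ ((__vector_size__ (16)));

sketch_v4sf_g
sketch_gather_ps (const float *base, sketch_v4si_g index)
{
  sketch_v4sf_g src  = { 0.0f, 0.0f, 0.0f, 0.0f };
  sketch_v4sf_g mask = { -1.0f, -1.0f, -1.0f, -1.0f };  /* all sign bits set */
  return __builtin_ia32_gathersiv4sf (src, base, index, mask, 4);
}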
28432 /* RTM. */
28433 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28434 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28436 /* MMX access to the vec_init patterns. */
28437 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28438 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28440 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28441 V4HI_FTYPE_HI_HI_HI_HI,
28442 IX86_BUILTIN_VEC_INIT_V4HI);
28444 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28445 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28446 IX86_BUILTIN_VEC_INIT_V8QI);
28448 /* Access to the vec_extract patterns. */
28449 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28450 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28451 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28452 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28453 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28454 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28455 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28456 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28457 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28458 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28460 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28461 "__builtin_ia32_vec_ext_v4hi",
28462 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28464 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28465 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28467 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28468 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
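/* Illustrative sketch only: the vec_ext builtins take the vector and a
   constant lane number, per their FTYPEs (here FLOAT_FTYPE_V4SF_INT).  */
typedef float sketch_v4sf_ext __attribute__ ((__vector_size__ (16)));

float
sketch_third_lane (sketch_v4sf_ext v)
{
  return __builtin_ia32_vec_ext_v4sf (v, 2);  /* lane index must be a constant */
}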
28470 /* Access to the vec_set patterns. */
28471 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28472 "__builtin_ia32_vec_set_v2di",
28473 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28475 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28476 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28478 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28479 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28481 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28482 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28484 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28485 "__builtin_ia32_vec_set_v4hi",
28486 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28488 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28489 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28491 /* RDSEED */
28492 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28493 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28494 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28495 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28496 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28497 "__builtin_ia32_rdseed_di_step",
28498 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28500 /* ADCX */
28501 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28502 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28503 def_builtin (OPTION_MASK_ISA_64BIT,
28504 "__builtin_ia32_addcarryx_u64",
28505 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28506 IX86_BUILTIN_ADDCARRYX64);
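/* Illustrative sketch only: UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED takes the
   carry-in and two addends, stores the sum through the pointer and returns
   the carry-out.  */
unsigned char
sketch_add_with_carry (unsigned char carry_in, unsigned int a, unsigned int b,
                       unsigned int *sum)
{
  return __builtin_ia32_addcarryx_u32 (carry_in, a, b, sum);
}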
28508 /* Add the FMA4/XOP multi-arg builtin instructions.  */
28509 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28511 if (d->name == 0)
28512 continue;
28514 ftype = (enum ix86_builtin_func_type) d->flag;
28515 def_builtin_const (d->mask, d->name, ftype, d->code);
28519 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28520 to return a pointer to VERSION_DECL if the outcome of the expression
28521 formed by PREDICATE_CHAIN is true. This function will be called during
28522 version dispatch to decide which function version to execute. It returns
28523 the basic block at the end, to which more conditions can be added. */
28525 static basic_block
28526 add_condition_to_bb (tree function_decl, tree version_decl,
28527 tree predicate_chain, basic_block new_bb)
28529 gimple return_stmt;
28530 tree convert_expr, result_var;
28531 gimple convert_stmt;
28532 gimple call_cond_stmt;
28533 gimple if_else_stmt;
28535 basic_block bb1, bb2, bb3;
28536 edge e12, e23;
28538 tree cond_var, and_expr_var = NULL_TREE;
28539 gimple_seq gseq;
28541 tree predicate_decl, predicate_arg;
28543 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28545 gcc_assert (new_bb != NULL);
28546 gseq = bb_seq (new_bb);
28549 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28550 build_fold_addr_expr (version_decl));
28551 result_var = create_tmp_var (ptr_type_node, NULL);
28552 convert_stmt = gimple_build_assign (result_var, convert_expr);
28553 return_stmt = gimple_build_return (result_var);
28555 if (predicate_chain == NULL_TREE)
28557 gimple_seq_add_stmt (&gseq, convert_stmt);
28558 gimple_seq_add_stmt (&gseq, return_stmt);
28559 set_bb_seq (new_bb, gseq);
28560 gimple_set_bb (convert_stmt, new_bb);
28561 gimple_set_bb (return_stmt, new_bb);
28562 pop_cfun ();
28563 return new_bb;
28566 while (predicate_chain != NULL)
28568 cond_var = create_tmp_var (integer_type_node, NULL);
28569 predicate_decl = TREE_PURPOSE (predicate_chain);
28570 predicate_arg = TREE_VALUE (predicate_chain);
28571 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28572 gimple_call_set_lhs (call_cond_stmt, cond_var);
28574 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28575 gimple_set_bb (call_cond_stmt, new_bb);
28576 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28578 predicate_chain = TREE_CHAIN (predicate_chain);
28580 if (and_expr_var == NULL)
28581 and_expr_var = cond_var;
28582 else
28584 gimple assign_stmt;
28585 /* Use MIN_EXPR to check whether any of the condition values is zero:
28586 and_expr_var = min_expr <cond_var, and_expr_var>.  */
28587 assign_stmt = gimple_build_assign (and_expr_var,
28588 build2 (MIN_EXPR, integer_type_node,
28589 cond_var, and_expr_var));
28591 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28592 gimple_set_bb (assign_stmt, new_bb);
28593 gimple_seq_add_stmt (&gseq, assign_stmt);
28597 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28598 integer_zero_node,
28599 NULL_TREE, NULL_TREE);
28600 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28601 gimple_set_bb (if_else_stmt, new_bb);
28602 gimple_seq_add_stmt (&gseq, if_else_stmt);
28604 gimple_seq_add_stmt (&gseq, convert_stmt);
28605 gimple_seq_add_stmt (&gseq, return_stmt);
28606 set_bb_seq (new_bb, gseq);
28608 bb1 = new_bb;
28609 e12 = split_block (bb1, if_else_stmt);
28610 bb2 = e12->dest;
28611 e12->flags &= ~EDGE_FALLTHRU;
28612 e12->flags |= EDGE_TRUE_VALUE;
28614 e23 = split_block (bb2, return_stmt);
28616 gimple_set_bb (convert_stmt, bb2);
28617 gimple_set_bb (return_stmt, bb2);
28619 bb3 = e23->dest;
28620 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28622 remove_edge (e23);
28623 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28625 pop_cfun ();
28627 return bb3;
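/* Illustrative sketch only: expressed as C, the block that the function
   above appends for one (version, predicate chain) pair behaves like the
   following; the version symbol and feature names are made-up examples,
   the real ones come from VERSION_DECL and PREDICATE_CHAIN.  */
static void *
sketch_resolver_condition (void)
{
  extern int sketch_version_corei7_popcnt (void);   /* stand-in version decl */

  int cond1 = __builtin_cpu_is ("corei7");
  int cond2 = __builtin_cpu_supports ("popcnt");
  int and_expr = cond1 < cond2 ? cond1 : cond2;     /* the MIN_EXPR test */
  if (and_expr > 0)
    return (void *) &sketch_version_corei7_popcnt;
  return (void *) 0;  /* real resolver: fall through to the next condition */
}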
28630 /* This parses the attribute arguments to target in DECL and determines
28631 the right builtin to use to match the platform specification.
28632 It returns the priority value for this version decl. If PREDICATE_LIST
28633 is not NULL, it stores the list of cpu features that need to be checked
28634 before dispatching this function. */
28636 static unsigned int
28637 get_builtin_code_for_version (tree decl, tree *predicate_list)
28639 tree attrs;
28640 struct cl_target_option cur_target;
28641 tree target_node;
28642 struct cl_target_option *new_target;
28643 const char *arg_str = NULL;
28644 const char *attrs_str = NULL;
28645 char *tok_str = NULL;
28646 char *token;
28648 /* Priority of i386 features, greater value is higher priority. This is
28649 used to decide the order in which function dispatch must happen. For
28650 instance, a version specialized for SSE4.2 should be checked for dispatch
28651 before a version for SSE3, as SSE4.2 implies SSE3. */
28652 enum feature_priority
28654 P_ZERO = 0,
28655 P_MMX,
28656 P_SSE,
28657 P_SSE2,
28658 P_SSE3,
28659 P_SSSE3,
28660 P_PROC_SSSE3,
28661 P_SSE4_a,
28662 P_PROC_SSE4_a,
28663 P_SSE4_1,
28664 P_SSE4_2,
28665 P_PROC_SSE4_2,
28666 P_POPCNT,
28667 P_AVX,
28668 P_AVX2,
28669 P_FMA,
28670 P_PROC_FMA
28673 enum feature_priority priority = P_ZERO;
28675 /* These are the target attribute strings for which a dispatcher is
28676 available, from fold_builtin_cpu. */
28678 static struct _feature_list
28680 const char *const name;
28681 const enum feature_priority priority;
28683 const feature_list[] =
28685 {"mmx", P_MMX},
28686 {"sse", P_SSE},
28687 {"sse2", P_SSE2},
28688 {"sse3", P_SSE3},
28689 {"ssse3", P_SSSE3},
28690 {"sse4.1", P_SSE4_1},
28691 {"sse4.2", P_SSE4_2},
28692 {"popcnt", P_POPCNT},
28693 {"avx", P_AVX},
28694 {"avx2", P_AVX2}
28698 static unsigned int NUM_FEATURES
28699 = sizeof (feature_list) / sizeof (struct _feature_list);
28701 unsigned int i;
28703 tree predicate_chain = NULL_TREE;
28704 tree predicate_decl, predicate_arg;
28706 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28707 gcc_assert (attrs != NULL);
28709 attrs = TREE_VALUE (TREE_VALUE (attrs));
28711 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28712 attrs_str = TREE_STRING_POINTER (attrs);
28715 /* Handle arch= if specified. For priority, set it to be 1 more than
28716 the best instruction set the processor can handle. For instance, if
28717 there is a version for atom and a version for ssse3 (the highest ISA
28718 priority for atom), the atom version must be checked for dispatch
28719 before the ssse3 version. */
28720 if (strstr (attrs_str, "arch=") != NULL)
28722 cl_target_option_save (&cur_target, &global_options);
28723 target_node = ix86_valid_target_attribute_tree (attrs);
28725 gcc_assert (target_node);
28726 new_target = TREE_TARGET_OPTION (target_node);
28727 gcc_assert (new_target);
28729 if (new_target->arch_specified && new_target->arch > 0)
28731 switch (new_target->arch)
28733 case PROCESSOR_CORE2:
28734 arg_str = "core2";
28735 priority = P_PROC_SSSE3;
28736 break;
28737 case PROCESSOR_COREI7:
28738 arg_str = "corei7";
28739 priority = P_PROC_SSE4_2;
28740 break;
28741 case PROCESSOR_ATOM:
28742 arg_str = "atom";
28743 priority = P_PROC_SSSE3;
28744 break;
28745 case PROCESSOR_AMDFAM10:
28746 arg_str = "amdfam10h";
28747 priority = P_PROC_SSE4_a;
28748 break;
28749 case PROCESSOR_BDVER1:
28750 arg_str = "bdver1";
28751 priority = P_PROC_FMA;
28752 break;
28753 case PROCESSOR_BDVER2:
28754 arg_str = "bdver2";
28755 priority = P_PROC_FMA;
28756 break;
28760 cl_target_option_restore (&global_options, &cur_target);
28762 if (predicate_list && arg_str == NULL)
28764 error_at (DECL_SOURCE_LOCATION (decl),
28765 "No dispatcher found for the versioning attributes");
28766 return 0;
28769 if (predicate_list)
28771 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28772 /* For a C string literal, the length includes the terminating NUL. */
28773 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28774 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28775 predicate_chain);
28779 /* Process feature name. */
28780 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28781 strcpy (tok_str, attrs_str);
28782 token = strtok (tok_str, ",");
28783 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28785 while (token != NULL)
28787 /* Do not process "arch=" */
28788 if (strncmp (token, "arch=", 5) == 0)
28790 token = strtok (NULL, ",");
28791 continue;
28793 for (i = 0; i < NUM_FEATURES; ++i)
28795 if (strcmp (token, feature_list[i].name) == 0)
28797 if (predicate_list)
28799 predicate_arg = build_string_literal (
28800 strlen (feature_list[i].name) + 1,
28801 feature_list[i].name);
28802 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28803 predicate_chain);
28805 /* Find the maximum priority feature. */
28806 if (feature_list[i].priority > priority)
28807 priority = feature_list[i].priority;
28809 break;
28812 if (predicate_list && i == NUM_FEATURES)
28814 error_at (DECL_SOURCE_LOCATION (decl),
28815 "No dispatcher found for %s", token);
28816 return 0;
28818 token = strtok (NULL, ",");
28820 free (tok_str);
28822 if (predicate_list && predicate_chain == NULL_TREE)
28824 error_at (DECL_SOURCE_LOCATION (decl),
28825 "No dispatcher found for the versioning attributes : %s",
28826 attrs_str);
28827 return 0;
28829 else if (predicate_list)
28831 predicate_chain = nreverse (predicate_chain);
28832 *predicate_list = predicate_chain;
28835 return priority;
28838 /* This compares the priority of target features in function DECL1
28839 and DECL2. It returns positive value if DECL1 is higher priority,
28840 negative value if DECL2 is higher priority and 0 if they are the
28841 same. */
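/* For example (hypothetical decls): a version declared with
   target ("arch=corei7") has priority P_PROC_SSE4_2 and one declared
   with target ("sse3") has priority P_SSE3, so the comparison is
   positive and the corei7 version is checked for dispatch first.  */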
28843 static int
28844 ix86_compare_version_priority (tree decl1, tree decl2)
28846 unsigned int priority1 = 0;
28847 unsigned int priority2 = 0;
28849 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl1)) != NULL)
28850 priority1 = get_builtin_code_for_version (decl1, NULL);
28852 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl2)) != NULL)
28853 priority2 = get_builtin_code_for_version (decl2, NULL);
28855 return (int)priority1 - (int)priority2;
28858 /* V1 and V2 point to function versions with different priorities
28859 based on the target ISA. This function compares their priorities. */
28861 static int
28862 feature_compare (const void *v1, const void *v2)
28864 typedef struct _function_version_info
28866 tree version_decl;
28867 tree predicate_chain;
28868 unsigned int dispatch_priority;
28869 } function_version_info;
28871 const function_version_info c1 = *(const function_version_info *)v1;
28872 const function_version_info c2 = *(const function_version_info *)v2;
28873 return (c2.dispatch_priority - c1.dispatch_priority);
28876 /* This function generates the dispatch function for
28877 multi-versioned functions. DISPATCH_DECL is the function which will
28878 contain the dispatch logic. FNDECLS are the function choices for
28879 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28880 in DISPATCH_DECL in which the dispatch code is generated. */
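/* A rough sketch of the body this builds, assuming a default version
   foo and a single extra version foo.arch_corei7 (the names are
   illustrative only):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       return foo.arch_corei7;
     return foo;

   The conditions are emitted in decreasing order of dispatch priority
   by the calls to add_condition_to_bb below.  */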
28882 static int
28883 dispatch_function_versions (tree dispatch_decl,
28884 void *fndecls_p,
28885 basic_block *empty_bb)
28887 tree default_decl;
28888 gimple ifunc_cpu_init_stmt;
28889 gimple_seq gseq;
28890 int ix;
28891 tree ele;
28892 vec<tree> *fndecls;
28893 unsigned int num_versions = 0;
28894 unsigned int actual_versions = 0;
28895 unsigned int i;
28897 struct _function_version_info
28899 tree version_decl;
28900 tree predicate_chain;
28901 unsigned int dispatch_priority;
28902 }*function_version_info;
28904 gcc_assert (dispatch_decl != NULL
28905 && fndecls_p != NULL
28906 && empty_bb != NULL);
28908 /* fndecls_p is actually a vector. */
28909 fndecls = static_cast<vec<tree> *> (fndecls_p);
28911 /* There must be at least one version other than the default. */
28912 num_versions = fndecls->length ();
28913 gcc_assert (num_versions >= 2);
28915 function_version_info = (struct _function_version_info *)
28916 XNEWVEC (struct _function_version_info, (num_versions - 1));
28918 /* The first version in the vector is the default decl. */
28919 default_decl = (*fndecls)[0];
28921 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28923 gseq = bb_seq (*empty_bb);
28924 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28925 constructors, so explicitly call __builtin_cpu_init here. */
28926 ifunc_cpu_init_stmt = gimple_build_call_vec (
28927 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28928 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28929 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28930 set_bb_seq (*empty_bb, gseq);
28932 pop_cfun ();
28935 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28937 tree version_decl = ele;
28938 tree predicate_chain = NULL_TREE;
28939 unsigned int priority;
28940 /* Get attribute string, parse it and find the right predicate decl.
28941 The predicate function could be a lengthy combination of many
28942 features, like arch-type and various isa-variants. */
28943 priority = get_builtin_code_for_version (version_decl,
28944 &predicate_chain);
28946 if (predicate_chain == NULL_TREE)
28947 continue;
28949 actual_versions++;
28950 function_version_info [ix - 1].version_decl = version_decl;
28951 function_version_info [ix - 1].predicate_chain = predicate_chain;
28952 function_version_info [ix - 1].dispatch_priority = priority;
28955 /* Sort the versions according to descending order of dispatch priority. The
28956 priority is based on the ISA. This is not a perfect solution. There
28957 could still be ambiguity. If more than one function version is suitable
28958 to execute, which one should be dispatched? In future, allow the user
28959 to specify a dispatch priority next to the version. */
28960 qsort (function_version_info, actual_versions,
28961 sizeof (struct _function_version_info), feature_compare);
28963 for (i = 0; i < actual_versions; ++i)
28964 *empty_bb = add_condition_to_bb (dispatch_decl,
28965 function_version_info[i].version_decl,
28966 function_version_info[i].predicate_chain,
28967 *empty_bb);
28969 /* Dispatch the default version at the end. */
28970 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28971 NULL, *empty_bb);
28973 free (function_version_info);
28974 return 0;
28977 /* Comparator function to be used in qsort routine to sort attribute
28978 specification strings to "target". */
28980 static int
28981 attr_strcmp (const void *v1, const void *v2)
28983 const char *c1 = *(char *const*)v1;
28984 const char *c2 = *(char *const*)v2;
28985 return strcmp (c1, c2);
28988 /* ARGLIST is the argument to target attribute. This function tokenizes
28989 the comma separated arguments, sorts them and returns a string which
28990 is a unique identifier for the comma separated arguments. It also
28991 replaces non-identifier characters "=,-" with "_". */
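/* For example (hypothetical input): the argument of
   target ("avx,arch=corei7") is rewritten to "avx,arch_corei7",
   split into "avx" and "arch_corei7", sorted, and returned as the
   single string "arch_corei7_avx".  */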
28993 static char *
28994 sorted_attr_string (tree arglist)
28996 tree arg;
28997 size_t str_len_sum = 0;
28998 char **args = NULL;
28999 char *attr_str, *ret_str;
29000 char *attr = NULL;
29001 unsigned int argnum = 1;
29002 unsigned int i;
29004 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29006 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29007 size_t len = strlen (str);
29008 str_len_sum += len + 1;
29009 if (arg != arglist)
29010 argnum++;
29011 for (i = 0; i < strlen (str); i++)
29012 if (str[i] == ',')
29013 argnum++;
29016 attr_str = XNEWVEC (char, str_len_sum);
29017 str_len_sum = 0;
29018 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29020 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29021 size_t len = strlen (str);
29022 memcpy (attr_str + str_len_sum, str, len);
29023 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29024 str_len_sum += len + 1;
29027 /* Replace "=,-" with "_". */
29028 for (i = 0; i < strlen (attr_str); i++)
29029 if (attr_str[i] == '=' || attr_str[i]== '-')
29030 attr_str[i] = '_';
29032 if (argnum == 1)
29033 return attr_str;
29035 args = XNEWVEC (char *, argnum);
29037 i = 0;
29038 attr = strtok (attr_str, ",");
29039 while (attr != NULL)
29041 args[i] = attr;
29042 i++;
29043 attr = strtok (NULL, ",");
29046 qsort (args, argnum, sizeof (char *), attr_strcmp);
29048 ret_str = XNEWVEC (char, str_len_sum);
29049 str_len_sum = 0;
29050 for (i = 0; i < argnum; i++)
29052 size_t len = strlen (args[i]);
29053 memcpy (ret_str + str_len_sum, args[i], len);
29054 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29055 str_len_sum += len + 1;
29058 XDELETEVEC (args);
29059 XDELETEVEC (attr_str);
29060 return ret_str;
29063 /* This function changes the assembler name for functions that are
29064 versions. If DECL is a function version and has a "target"
29065 attribute, it appends the attribute string to its assembler name. */
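/* For example (hypothetical decl): a version of foo declared with
   target ("avx,arch=corei7") gets the assembler name
   "foo.arch_corei7_avx", while the version declared with
   target ("default") keeps its original name.  */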
29067 static tree
29068 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29070 tree version_attr;
29071 const char *orig_name, *version_string;
29072 char *attr_str, *assembler_name;
29074 if (DECL_DECLARED_INLINE_P (decl)
29075 && lookup_attribute ("gnu_inline",
29076 DECL_ATTRIBUTES (decl)))
29077 error_at (DECL_SOURCE_LOCATION (decl),
29078 "Function versions cannot be marked as gnu_inline,"
29079 " bodies have to be generated");
29081 if (DECL_VIRTUAL_P (decl)
29082 || DECL_VINDEX (decl))
29083 error_at (DECL_SOURCE_LOCATION (decl),
29084 "Virtual function versioning not supported\n");
29086 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29088 /* target attribute string is NULL for default functions. */
29089 if (version_attr == NULL_TREE)
29090 return id;
29092 orig_name = IDENTIFIER_POINTER (id);
29093 version_string
29094 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29096 if (strcmp (version_string, "default") == 0)
29097 return id;
29099 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29100 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29102 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29104 /* Allow assembler name to be modified if already set. */
29105 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29106 SET_DECL_RTL (decl, NULL);
29108 tree ret = get_identifier (assembler_name);
29109 XDELETEVEC (attr_str);
29110 XDELETEVEC (assembler_name);
29111 return ret;
29114 /* This function returns true if FN1 and FN2 are versions of the same function,
29115 that is, the target strings of the function decls are different. This assumes
29116 that FN1 and FN2 have the same signature. */
29118 static bool
29119 ix86_function_versions (tree fn1, tree fn2)
29121 tree attr1, attr2;
29122 char *target1, *target2;
29123 bool result;
29125 if (TREE_CODE (fn1) != FUNCTION_DECL
29126 || TREE_CODE (fn2) != FUNCTION_DECL)
29127 return false;
29129 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29130 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29132 /* At least one function decl should have the target attribute specified. */
29133 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29134 return false;
29136 /* Diagnose missing target attribute if one of the decls is already
29137 multi-versioned. */
29138 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29140 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29142 if (attr2 != NULL_TREE)
29144 tree tem = fn1;
29145 fn1 = fn2;
29146 fn2 = tem;
29147 attr1 = attr2;
29149 error_at (DECL_SOURCE_LOCATION (fn2),
29150 "missing %<target%> attribute for multi-versioned %D",
29151 fn2);
29152 error_at (DECL_SOURCE_LOCATION (fn1),
29153 "previous declaration of %D", fn1);
29154 /* Prevent diagnosing of the same error multiple times. */
29155 DECL_ATTRIBUTES (fn2)
29156 = tree_cons (get_identifier ("target"),
29157 copy_node (TREE_VALUE (attr1)),
29158 DECL_ATTRIBUTES (fn2));
29160 return false;
29163 target1 = sorted_attr_string (TREE_VALUE (attr1));
29164 target2 = sorted_attr_string (TREE_VALUE (attr2));
29166 /* The sorted target strings must be different for fn1 and fn2
29167 to be versions. */
29168 if (strcmp (target1, target2) == 0)
29169 result = false;
29170 else
29171 result = true;
29173 XDELETEVEC (target1);
29174 XDELETEVEC (target2);
29176 return result;
29179 static tree
29180 ix86_mangle_decl_assembler_name (tree decl, tree id)
29182 /* For function version, add the target suffix to the assembler name. */
29183 if (TREE_CODE (decl) == FUNCTION_DECL
29184 && DECL_FUNCTION_VERSIONED (decl))
29185 id = ix86_mangle_function_version_assembler_name (decl, id);
29186 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29187 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29188 #endif
29190 return id;
29193 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29194 is true, append the full path name of the source file. */
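/* For example (hypothetical decl): for a function foo,
   make_name (decl, "resolver", false) returns "foo.resolver", and
   make_name (decl, "resolver", true) returns "foo.<unique>.resolver",
   where <unique> comes from get_file_function_name.  */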
29196 static char *
29197 make_name (tree decl, const char *suffix, bool make_unique)
29199 char *global_var_name;
29200 int name_len;
29201 const char *name;
29202 const char *unique_name = NULL;
29204 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29206 /* Get a unique name that can be used globally without any chances
29207 of collision at link time. */
29208 if (make_unique)
29209 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29211 name_len = strlen (name) + strlen (suffix) + 2;
29213 if (make_unique)
29214 name_len += strlen (unique_name) + 1;
29215 global_var_name = XNEWVEC (char, name_len);
29217 /* Use '.' to concatenate names as it is demangler friendly. */
29218 if (make_unique)
29219 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29220 suffix);
29221 else
29222 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29224 return global_var_name;
29227 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29229 /* Make a dispatcher declaration for the multi-versioned function DECL.
29230 Calls to DECL function will be replaced with calls to the dispatcher
29231 by the front-end. Return the decl created. */
29233 static tree
29234 make_dispatcher_decl (const tree decl)
29236 tree func_decl;
29237 char *func_name;
29238 tree fn_type, func_type;
29239 bool is_uniq = false;
29241 if (TREE_PUBLIC (decl) == 0)
29242 is_uniq = true;
29244 func_name = make_name (decl, "ifunc", is_uniq);
29246 fn_type = TREE_TYPE (decl);
29247 func_type = build_function_type (TREE_TYPE (fn_type),
29248 TYPE_ARG_TYPES (fn_type));
29250 func_decl = build_fn_decl (func_name, func_type);
29251 XDELETEVEC (func_name);
29252 TREE_USED (func_decl) = 1;
29253 DECL_CONTEXT (func_decl) = NULL_TREE;
29254 DECL_INITIAL (func_decl) = error_mark_node;
29255 DECL_ARTIFICIAL (func_decl) = 1;
29256 /* Mark this func as external, the resolver will flip it again if
29257 it gets generated. */
29258 DECL_EXTERNAL (func_decl) = 1;
29259 /* IFUNCs have to be externally visible, so make the decl public. */
29260 TREE_PUBLIC (func_decl) = 1;
29262 return func_decl;
29265 #endif
29267 /* Return true if DECL is multi-versioned and is the default function,
29268 that is, it is not tagged with a target-specific optimization. */
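/* For example, of all the versions of a function foo, the one declared
   as (illustrative declaration)
     __attribute__ ((target ("default"))) int foo (void);
   is the default version.  */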
29270 static bool
29271 is_function_default_version (const tree decl)
29273 if (TREE_CODE (decl) != FUNCTION_DECL
29274 || !DECL_FUNCTION_VERSIONED (decl))
29275 return false;
29276 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29277 gcc_assert (attr);
29278 attr = TREE_VALUE (TREE_VALUE (attr));
29279 return (TREE_CODE (attr) == STRING_CST
29280 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29283 /* Make a dispatcher declaration for the multi-versioned function DECL.
29284 Calls to DECL function will be replaced with calls to the dispatcher
29285 by the front-end. Returns the decl of the dispatcher function. */
29287 static tree
29288 ix86_get_function_versions_dispatcher (void *decl)
29290 tree fn = (tree) decl;
29291 struct cgraph_node *node = NULL;
29292 struct cgraph_node *default_node = NULL;
29293 struct cgraph_function_version_info *node_v = NULL;
29294 struct cgraph_function_version_info *first_v = NULL;
29296 tree dispatch_decl = NULL;
29298 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29299 struct cgraph_function_version_info *it_v = NULL;
29300 struct cgraph_node *dispatcher_node = NULL;
29301 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29302 #endif
29304 struct cgraph_function_version_info *default_version_info = NULL;
29306 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29308 node = cgraph_get_node (fn);
29309 gcc_assert (node != NULL);
29311 node_v = get_cgraph_node_version (node);
29312 gcc_assert (node_v != NULL);
29314 if (node_v->dispatcher_resolver != NULL)
29315 return node_v->dispatcher_resolver;
29317 /* Find the default version and make it the first node. */
29318 first_v = node_v;
29319 /* Go to the beginning of the chain. */
29320 while (first_v->prev != NULL)
29321 first_v = first_v->prev;
29322 default_version_info = first_v;
29323 while (default_version_info != NULL)
29325 if (is_function_default_version
29326 (default_version_info->this_node->symbol.decl))
29327 break;
29328 default_version_info = default_version_info->next;
29331 /* If there is no default node, just return NULL. */
29332 if (default_version_info == NULL)
29333 return NULL;
29335 /* Make default info the first node. */
29336 if (first_v != default_version_info)
29338 default_version_info->prev->next = default_version_info->next;
29339 if (default_version_info->next)
29340 default_version_info->next->prev = default_version_info->prev;
29341 first_v->prev = default_version_info;
29342 default_version_info->next = first_v;
29343 default_version_info->prev = NULL;
29346 default_node = default_version_info->this_node;
29348 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29349 /* Right now, the dispatching is done via ifunc. */
29350 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29352 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29353 gcc_assert (dispatcher_node != NULL);
29354 dispatcher_node->dispatcher_function = 1;
29355 dispatcher_version_info
29356 = insert_new_cgraph_node_version (dispatcher_node);
29357 dispatcher_version_info->next = default_version_info;
29358 dispatcher_node->local.finalized = 1;
29360 /* Set the dispatcher for all the versions. */
29361 it_v = default_version_info;
29362 while (it_v != NULL)
29364 it_v->dispatcher_resolver = dispatch_decl;
29365 it_v = it_v->next;
29367 #else
29368 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29369 "multiversioning needs ifunc which is not supported "
29370 "in this configuration");
29371 #endif
29372 return dispatch_decl;
29375 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29376 it to CHAIN. */
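/* For example, make_attribute ("ifunc", "foo.resolver", NULL_TREE)
   builds the attribute ifunc ("foo.resolver"); make_resolver_func below
   uses this to tie the dispatcher decl to its resolver (the name is
   illustrative).  */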
29378 static tree
29379 make_attribute (const char *name, const char *arg_name, tree chain)
29381 tree attr_name;
29382 tree attr_arg_name;
29383 tree attr_args;
29384 tree attr;
29386 attr_name = get_identifier (name);
29387 attr_arg_name = build_string (strlen (arg_name), arg_name);
29388 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29389 attr = tree_cons (attr_name, attr_args, chain);
29390 return attr;
29393 /* Make the resolver function decl to dispatch the versions of
29394 a multi-versioned function, DEFAULT_DECL. Create an
29395 empty basic block in the resolver and store the pointer in
29396 EMPTY_BB. Return the decl of the resolver function. */
29398 static tree
29399 make_resolver_func (const tree default_decl,
29400 const tree dispatch_decl,
29401 basic_block *empty_bb)
29403 char *resolver_name;
29404 tree decl, type, decl_name, t;
29405 bool is_uniq = false;
29407 /* IFUNCs have to be globally visible. So, if the default_decl is
29408 not, then the name of the IFUNC should be made unique. */
29409 if (TREE_PUBLIC (default_decl) == 0)
29410 is_uniq = true;
29412 /* Append the filename to the resolver function if the versions are
29413 not externally visible. This is because the resolver function has
29414 to be externally visible for the loader to find it. So, appending
29415 the filename will prevent conflicts with a resolver function from
29416 another module which is based on the same version name. */
29417 resolver_name = make_name (default_decl, "resolver", is_uniq);
29419 /* The resolver function should return a (void *). */
29420 type = build_function_type_list (ptr_type_node, NULL_TREE);
29422 decl = build_fn_decl (resolver_name, type);
29423 decl_name = get_identifier (resolver_name);
29424 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29426 DECL_NAME (decl) = decl_name;
29427 TREE_USED (decl) = 1;
29428 DECL_ARTIFICIAL (decl) = 1;
29429 DECL_IGNORED_P (decl) = 0;
29430 /* IFUNC resolvers have to be externally visible. */
29431 TREE_PUBLIC (decl) = 1;
29432 DECL_UNINLINABLE (decl) = 0;
29434 /* Resolver is not external, body is generated. */
29435 DECL_EXTERNAL (decl) = 0;
29436 DECL_EXTERNAL (dispatch_decl) = 0;
29438 DECL_CONTEXT (decl) = NULL_TREE;
29439 DECL_INITIAL (decl) = make_node (BLOCK);
29440 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29442 if (DECL_COMDAT_GROUP (default_decl)
29443 || TREE_PUBLIC (default_decl))
29445 /* In this case, each translation unit with a call to this
29446 versioned function will put out a resolver. Ensure it
29447 is comdat to keep just one copy. */
29448 DECL_COMDAT (decl) = 1;
29449 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29451 /* Build result decl and add to function_decl. */
29452 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29453 DECL_ARTIFICIAL (t) = 1;
29454 DECL_IGNORED_P (t) = 1;
29455 DECL_RESULT (decl) = t;
29457 gimplify_function_tree (decl);
29458 push_cfun (DECL_STRUCT_FUNCTION (decl));
29459 *empty_bb = init_lowered_empty_function (decl, false);
29461 cgraph_add_new_function (decl, true);
29462 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29464 pop_cfun ();
29466 gcc_assert (dispatch_decl != NULL);
29467 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29468 DECL_ATTRIBUTES (dispatch_decl)
29469 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29471 /* Create the alias for dispatch to resolver here. */
29472 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29473 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29474 XDELETEVEC (resolver_name);
29475 return decl;
29478 /* Generate the dispatching code body to dispatch multi-versioned function
29479 DECL. The target hook is called to process the "target" attributes and
29480 provide the code to dispatch the right function at run-time. NODE points
29481 to the dispatcher decl whose body will be created. */
29483 static tree
29484 ix86_generate_version_dispatcher_body (void *node_p)
29486 tree resolver_decl;
29487 basic_block empty_bb;
29488 vec<tree> fn_ver_vec = vNULL;
29489 tree default_ver_decl;
29490 struct cgraph_node *versn;
29491 struct cgraph_node *node;
29493 struct cgraph_function_version_info *node_version_info = NULL;
29494 struct cgraph_function_version_info *versn_info = NULL;
29496 node = (cgraph_node *)node_p;
29498 node_version_info = get_cgraph_node_version (node);
29499 gcc_assert (node->dispatcher_function
29500 && node_version_info != NULL);
29502 if (node_version_info->dispatcher_resolver)
29503 return node_version_info->dispatcher_resolver;
29505 /* The first version in the chain corresponds to the default version. */
29506 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29508 /* node is going to be an alias, so remove the finalized bit. */
29509 node->local.finalized = false;
29511 resolver_decl = make_resolver_func (default_ver_decl,
29512 node->symbol.decl, &empty_bb);
29514 node_version_info->dispatcher_resolver = resolver_decl;
29516 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29518 fn_ver_vec.create (2);
29520 for (versn_info = node_version_info->next; versn_info;
29521 versn_info = versn_info->next)
29523 versn = versn_info->this_node;
29524 /* Check for virtual functions here again, as by this time it should
29525 have been determined if this function needs a vtable index or
29526 not. This happens for methods in derived classes that override
29527 virtual methods in base classes but are not explicitly marked as
29528 virtual. */
29529 if (DECL_VINDEX (versn->symbol.decl))
29530 error_at (DECL_SOURCE_LOCATION (versn->symbol.decl),
29531 "Virtual function multiversioning not supported");
29532 fn_ver_vec.safe_push (versn->symbol.decl);
29535 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29536 fn_ver_vec.release ();
29537 rebuild_cgraph_edges ();
29538 pop_cfun ();
29539 return resolver_decl;
29541 /* This builds the processor_model struct type defined in
29542 libgcc/config/i386/cpuinfo.c */
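/* The type built here mirrors the struct defined in
   libgcc/config/i386/cpuinfo.c, roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */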
29544 static tree
29545 build_processor_model_struct (void)
29547 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29548 "__cpu_features"};
29549 tree field = NULL_TREE, field_chain = NULL_TREE;
29550 int i;
29551 tree type = make_node (RECORD_TYPE);
29553 /* The first 3 fields are unsigned int. */
29554 for (i = 0; i < 3; ++i)
29556 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29557 get_identifier (field_name[i]), unsigned_type_node);
29558 if (field_chain != NULL_TREE)
29559 DECL_CHAIN (field) = field_chain;
29560 field_chain = field;
29563 /* The last field is an array of unsigned integers of size one. */
29564 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29565 get_identifier (field_name[3]),
29566 build_array_type (unsigned_type_node,
29567 build_index_type (size_one_node)));
29568 if (field_chain != NULL_TREE)
29569 DECL_CHAIN (field) = field_chain;
29570 field_chain = field;
29572 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29573 return type;
29576 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29578 static tree
29579 make_var_decl (tree type, const char *name)
29581 tree new_decl;
29583 new_decl = build_decl (UNKNOWN_LOCATION,
29584 VAR_DECL,
29585 get_identifier(name),
29586 type);
29588 DECL_EXTERNAL (new_decl) = 1;
29589 TREE_STATIC (new_decl) = 1;
29590 TREE_PUBLIC (new_decl) = 1;
29591 DECL_INITIAL (new_decl) = 0;
29592 DECL_ARTIFICIAL (new_decl) = 0;
29593 DECL_PRESERVE_P (new_decl) = 1;
29595 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29596 assemble_variable (new_decl, 0, 0, 0);
29598 return new_decl;
29601 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29602 into an integer defined in libgcc/config/i386/cpuinfo.c */
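/* A sketch of the folded forms, using the names from cpuinfo.c:

     __builtin_cpu_is ("amd")        ->  __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("avx")  ->  __cpu_model.__cpu_features[0]
                                           & (1 << F_AVX)

   with the result converted to int.  */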
29604 static tree
29605 fold_builtin_cpu (tree fndecl, tree *args)
29607 unsigned int i;
29608 enum ix86_builtins fn_code = (enum ix86_builtins)
29609 DECL_FUNCTION_CODE (fndecl);
29610 tree param_string_cst = NULL;
29612 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29613 enum processor_features
29615 F_CMOV = 0,
29616 F_MMX,
29617 F_POPCNT,
29618 F_SSE,
29619 F_SSE2,
29620 F_SSE3,
29621 F_SSSE3,
29622 F_SSE4_1,
29623 F_SSE4_2,
29624 F_AVX,
29625 F_AVX2,
29626 F_MAX
29629 /* These are the values for vendor types and cpu types and subtypes
29630 in cpuinfo.c. Cpu types and subtypes should be subtracted by
29631 the corresponding start value. */
29632 enum processor_model
29634 M_INTEL = 1,
29635 M_AMD,
29636 M_CPU_TYPE_START,
29637 M_INTEL_ATOM,
29638 M_INTEL_CORE2,
29639 M_INTEL_COREI7,
29640 M_AMDFAM10H,
29641 M_AMDFAM15H,
29642 M_CPU_SUBTYPE_START,
29643 M_INTEL_COREI7_NEHALEM,
29644 M_INTEL_COREI7_WESTMERE,
29645 M_INTEL_COREI7_SANDYBRIDGE,
29646 M_AMDFAM10H_BARCELONA,
29647 M_AMDFAM10H_SHANGHAI,
29648 M_AMDFAM10H_ISTANBUL,
29649 M_AMDFAM15H_BDVER1,
29650 M_AMDFAM15H_BDVER2,
29651 M_AMDFAM15H_BDVER3
29654 static struct _arch_names_table
29656 const char *const name;
29657 const enum processor_model model;
29659 const arch_names_table[] =
29661 {"amd", M_AMD},
29662 {"intel", M_INTEL},
29663 {"atom", M_INTEL_ATOM},
29664 {"core2", M_INTEL_CORE2},
29665 {"corei7", M_INTEL_COREI7},
29666 {"nehalem", M_INTEL_COREI7_NEHALEM},
29667 {"westmere", M_INTEL_COREI7_WESTMERE},
29668 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29669 {"amdfam10h", M_AMDFAM10H},
29670 {"barcelona", M_AMDFAM10H_BARCELONA},
29671 {"shanghai", M_AMDFAM10H_SHANGHAI},
29672 {"istanbul", M_AMDFAM10H_ISTANBUL},
29673 {"amdfam15h", M_AMDFAM15H},
29674 {"bdver1", M_AMDFAM15H_BDVER1},
29675 {"bdver2", M_AMDFAM15H_BDVER2},
29676 {"bdver3", M_AMDFAM15H_BDVER3},
29679 static struct _isa_names_table
29681 const char *const name;
29682 const enum processor_features feature;
29684 const isa_names_table[] =
29686 {"cmov", F_CMOV},
29687 {"mmx", F_MMX},
29688 {"popcnt", F_POPCNT},
29689 {"sse", F_SSE},
29690 {"sse2", F_SSE2},
29691 {"sse3", F_SSE3},
29692 {"ssse3", F_SSSE3},
29693 {"sse4.1", F_SSE4_1},
29694 {"sse4.2", F_SSE4_2},
29695 {"avx", F_AVX},
29696 {"avx2", F_AVX2}
29699 tree __processor_model_type = build_processor_model_struct ();
29700 tree __cpu_model_var = make_var_decl (__processor_model_type,
29701 "__cpu_model");
29703 gcc_assert ((args != NULL) && (*args != NULL));
29705 param_string_cst = *args;
29706 while (param_string_cst
29707 && TREE_CODE (param_string_cst) != STRING_CST)
29709 /* *args must be an expr that can contain other EXPRs leading to a
29710 STRING_CST. */
29711 if (!EXPR_P (param_string_cst))
29713 error ("Parameter to builtin must be a string constant or literal");
29714 return integer_zero_node;
29716 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29719 gcc_assert (param_string_cst);
29721 if (fn_code == IX86_BUILTIN_CPU_IS)
29723 tree ref;
29724 tree field;
29725 tree final;
29727 unsigned int field_val = 0;
29728 unsigned int NUM_ARCH_NAMES
29729 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29731 for (i = 0; i < NUM_ARCH_NAMES; i++)
29732 if (strcmp (arch_names_table[i].name,
29733 TREE_STRING_POINTER (param_string_cst)) == 0)
29734 break;
29736 if (i == NUM_ARCH_NAMES)
29738 error ("Parameter to builtin not valid: %s",
29739 TREE_STRING_POINTER (param_string_cst));
29740 return integer_zero_node;
29743 field = TYPE_FIELDS (__processor_model_type);
29744 field_val = arch_names_table[i].model;
29746 /* CPU types are stored in the next field. */
29747 if (field_val > M_CPU_TYPE_START
29748 && field_val < M_CPU_SUBTYPE_START)
29750 field = DECL_CHAIN (field);
29751 field_val -= M_CPU_TYPE_START;
29754 /* CPU subtypes are stored two fields further on, in __cpu_subtype. */
29755 if (field_val > M_CPU_SUBTYPE_START)
29757 field = DECL_CHAIN (DECL_CHAIN (field));
29758 field_val -= M_CPU_SUBTYPE_START;
29761 /* Get the appropriate field in __cpu_model. */
29762 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29763 field, NULL_TREE);
29765 /* Check the value. */
29766 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29767 build_int_cstu (unsigned_type_node, field_val));
29768 return build1 (CONVERT_EXPR, integer_type_node, final);
29770 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29772 tree ref;
29773 tree array_elt;
29774 tree field;
29775 tree final;
29777 unsigned int field_val = 0;
29778 unsigned int NUM_ISA_NAMES
29779 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29781 for (i = 0; i < NUM_ISA_NAMES; i++)
29782 if (strcmp (isa_names_table[i].name,
29783 TREE_STRING_POINTER (param_string_cst)) == 0)
29784 break;
29786 if (i == NUM_ISA_NAMES)
29788 error ("Parameter to builtin not valid: %s",
29789 TREE_STRING_POINTER (param_string_cst));
29790 return integer_zero_node;
29793 field = TYPE_FIELDS (__processor_model_type);
29794 /* Get the last field, which is __cpu_features. */
29795 while (DECL_CHAIN (field))
29796 field = DECL_CHAIN (field);
29798 /* Get the appropriate field: __cpu_model.__cpu_features */
29799 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29800 field, NULL_TREE);
29802 /* Access the 0th element of __cpu_features array. */
29803 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29804 integer_zero_node, NULL_TREE, NULL_TREE);
29806 field_val = (1 << isa_names_table[i].feature);
29807 /* Return __cpu_model.__cpu_features[0] & field_val */
29808 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29809 build_int_cstu (unsigned_type_node, field_val));
29810 return build1 (CONVERT_EXPR, integer_type_node, final);
29812 gcc_unreachable ();
29815 static tree
29816 ix86_fold_builtin (tree fndecl, int n_args,
29817 tree *args, bool ignore ATTRIBUTE_UNUSED)
29819 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29821 enum ix86_builtins fn_code = (enum ix86_builtins)
29822 DECL_FUNCTION_CODE (fndecl);
29823 if (fn_code == IX86_BUILTIN_CPU_IS
29824 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29826 gcc_assert (n_args == 1);
29827 return fold_builtin_cpu (fndecl, args);
29831 #ifdef SUBTARGET_FOLD_BUILTIN
29832 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29833 #endif
29835 return NULL_TREE;
29838 /* Make builtins to detect cpu type and features supported. NAME is
29839 the builtin name, CODE is the builtin code, and FTYPE is the function
29840 type of the builtin. */
29842 static void
29843 make_cpu_type_builtin (const char* name, int code,
29844 enum ix86_builtin_func_type ftype, bool is_const)
29846 tree decl;
29847 tree type;
29849 type = ix86_get_builtin_func_type (ftype);
29850 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29851 NULL, NULL_TREE);
29852 gcc_assert (decl != NULL_TREE);
29853 ix86_builtins[(int) code] = decl;
29854 TREE_READONLY (decl) = is_const;
29857 /* Make builtins to get CPU type and features supported. The created
29858 builtins are:
29860 __builtin_cpu_init (), to detect cpu type and features,
29861 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29862 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29865 static void
29866 ix86_init_platform_type_builtins (void)
29868 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29869 INT_FTYPE_VOID, false);
29870 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29871 INT_FTYPE_PCCHAR, true);
29872 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29873 INT_FTYPE_PCCHAR, true);
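/* Typical user-level use of these builtins (illustrative only):

     if (__builtin_cpu_is ("corei7"))
       ...
     else if (__builtin_cpu_supports ("ssse3"))
       ...

   Code that can run before constructors, such as the IFUNC resolvers
   generated above, must call __builtin_cpu_init () first.  */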
29876 /* Internal method for ix86_init_builtins. */
29878 static void
29879 ix86_init_builtins_va_builtins_abi (void)
29881 tree ms_va_ref, sysv_va_ref;
29882 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29883 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29884 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29885 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29887 if (!TARGET_64BIT)
29888 return;
29889 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29890 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29891 ms_va_ref = build_reference_type (ms_va_list_type_node);
29892 sysv_va_ref =
29893 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29895 fnvoid_va_end_ms =
29896 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29897 fnvoid_va_start_ms =
29898 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29899 fnvoid_va_end_sysv =
29900 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29901 fnvoid_va_start_sysv =
29902 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29903 NULL_TREE);
29904 fnvoid_va_copy_ms =
29905 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29906 NULL_TREE);
29907 fnvoid_va_copy_sysv =
29908 build_function_type_list (void_type_node, sysv_va_ref,
29909 sysv_va_ref, NULL_TREE);
29911 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29912 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29913 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29914 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29915 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29916 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29917 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29918 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29919 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29920 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29921 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29922 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29925 static void
29926 ix86_init_builtin_types (void)
29928 tree float128_type_node, float80_type_node;
29930 /* The __float80 type. */
29931 float80_type_node = long_double_type_node;
29932 if (TYPE_MODE (float80_type_node) != XFmode)
29934 /* The __float80 type. */
29935 float80_type_node = make_node (REAL_TYPE);
29937 TYPE_PRECISION (float80_type_node) = 80;
29938 layout_type (float80_type_node);
29940 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29942 /* The __float128 type. */
29943 float128_type_node = make_node (REAL_TYPE);
29944 TYPE_PRECISION (float128_type_node) = 128;
29945 layout_type (float128_type_node);
29946 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29948 /* This macro is built by i386-builtin-types.awk. */
29949 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29952 static void
29953 ix86_init_builtins (void)
29955 tree t;
29957 ix86_init_builtin_types ();
29959 /* Builtins to get CPU type and features. */
29960 ix86_init_platform_type_builtins ();
29962 /* TFmode support builtins. */
29963 def_builtin_const (0, "__builtin_infq",
29964 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29965 def_builtin_const (0, "__builtin_huge_valq",
29966 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29968 /* We will expand them to a normal call if SSE isn't available, since
29969 they are used by libgcc. */
29970 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29971 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29972 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29973 TREE_READONLY (t) = 1;
29974 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29976 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29977 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29978 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29979 TREE_READONLY (t) = 1;
29980 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29982 ix86_init_tm_builtins ();
29983 ix86_init_mmx_sse_builtins ();
29985 if (TARGET_LP64)
29986 ix86_init_builtins_va_builtins_abi ();
29988 #ifdef SUBTARGET_INIT_BUILTINS
29989 SUBTARGET_INIT_BUILTINS;
29990 #endif
29993 /* Return the ix86 builtin for CODE. */
29995 static tree
29996 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29998 if (code >= IX86_BUILTIN_MAX)
29999 return error_mark_node;
30001 return ix86_builtins[code];
30004 /* Errors in the source file can cause expand_expr to return const0_rtx
30005 where we expect a vector. To avoid crashing, use one of the vector
30006 clear instructions. */
30007 static rtx
30008 safe_vector_operand (rtx x, enum machine_mode mode)
30010 if (x == const0_rtx)
30011 x = CONST0_RTX (mode);
30012 return x;
30015 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30017 static rtx
30018 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30020 rtx pat;
30021 tree arg0 = CALL_EXPR_ARG (exp, 0);
30022 tree arg1 = CALL_EXPR_ARG (exp, 1);
30023 rtx op0 = expand_normal (arg0);
30024 rtx op1 = expand_normal (arg1);
30025 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30026 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30027 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30029 if (VECTOR_MODE_P (mode0))
30030 op0 = safe_vector_operand (op0, mode0);
30031 if (VECTOR_MODE_P (mode1))
30032 op1 = safe_vector_operand (op1, mode1);
30034 if (optimize || !target
30035 || GET_MODE (target) != tmode
30036 || !insn_data[icode].operand[0].predicate (target, tmode))
30037 target = gen_reg_rtx (tmode);
30039 if (GET_MODE (op1) == SImode && mode1 == TImode)
30041 rtx x = gen_reg_rtx (V4SImode);
30042 emit_insn (gen_sse2_loadd (x, op1));
30043 op1 = gen_lowpart (TImode, x);
30046 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30047 op0 = copy_to_mode_reg (mode0, op0);
30048 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30049 op1 = copy_to_mode_reg (mode1, op1);
30051 pat = GEN_FCN (icode) (target, op0, op1);
30052 if (! pat)
30053 return 0;
30055 emit_insn (pat);
30057 return target;
30060 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30062 static rtx
30063 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30064 enum ix86_builtin_func_type m_type,
30065 enum rtx_code sub_code)
30067 rtx pat;
30068 int i;
30069 int nargs;
30070 bool comparison_p = false;
30071 bool tf_p = false;
30072 bool last_arg_constant = false;
30073 int num_memory = 0;
30074 struct {
30075 rtx op;
30076 enum machine_mode mode;
30077 } args[4];
30079 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30081 switch (m_type)
30083 case MULTI_ARG_4_DF2_DI_I:
30084 case MULTI_ARG_4_DF2_DI_I1:
30085 case MULTI_ARG_4_SF2_SI_I:
30086 case MULTI_ARG_4_SF2_SI_I1:
30087 nargs = 4;
30088 last_arg_constant = true;
30089 break;
30091 case MULTI_ARG_3_SF:
30092 case MULTI_ARG_3_DF:
30093 case MULTI_ARG_3_SF2:
30094 case MULTI_ARG_3_DF2:
30095 case MULTI_ARG_3_DI:
30096 case MULTI_ARG_3_SI:
30097 case MULTI_ARG_3_SI_DI:
30098 case MULTI_ARG_3_HI:
30099 case MULTI_ARG_3_HI_SI:
30100 case MULTI_ARG_3_QI:
30101 case MULTI_ARG_3_DI2:
30102 case MULTI_ARG_3_SI2:
30103 case MULTI_ARG_3_HI2:
30104 case MULTI_ARG_3_QI2:
30105 nargs = 3;
30106 break;
30108 case MULTI_ARG_2_SF:
30109 case MULTI_ARG_2_DF:
30110 case MULTI_ARG_2_DI:
30111 case MULTI_ARG_2_SI:
30112 case MULTI_ARG_2_HI:
30113 case MULTI_ARG_2_QI:
30114 nargs = 2;
30115 break;
30117 case MULTI_ARG_2_DI_IMM:
30118 case MULTI_ARG_2_SI_IMM:
30119 case MULTI_ARG_2_HI_IMM:
30120 case MULTI_ARG_2_QI_IMM:
30121 nargs = 2;
30122 last_arg_constant = true;
30123 break;
30125 case MULTI_ARG_1_SF:
30126 case MULTI_ARG_1_DF:
30127 case MULTI_ARG_1_SF2:
30128 case MULTI_ARG_1_DF2:
30129 case MULTI_ARG_1_DI:
30130 case MULTI_ARG_1_SI:
30131 case MULTI_ARG_1_HI:
30132 case MULTI_ARG_1_QI:
30133 case MULTI_ARG_1_SI_DI:
30134 case MULTI_ARG_1_HI_DI:
30135 case MULTI_ARG_1_HI_SI:
30136 case MULTI_ARG_1_QI_DI:
30137 case MULTI_ARG_1_QI_SI:
30138 case MULTI_ARG_1_QI_HI:
30139 nargs = 1;
30140 break;
30142 case MULTI_ARG_2_DI_CMP:
30143 case MULTI_ARG_2_SI_CMP:
30144 case MULTI_ARG_2_HI_CMP:
30145 case MULTI_ARG_2_QI_CMP:
30146 nargs = 2;
30147 comparison_p = true;
30148 break;
30150 case MULTI_ARG_2_SF_TF:
30151 case MULTI_ARG_2_DF_TF:
30152 case MULTI_ARG_2_DI_TF:
30153 case MULTI_ARG_2_SI_TF:
30154 case MULTI_ARG_2_HI_TF:
30155 case MULTI_ARG_2_QI_TF:
30156 nargs = 2;
30157 tf_p = true;
30158 break;
30160 default:
30161 gcc_unreachable ();
30164 if (optimize || !target
30165 || GET_MODE (target) != tmode
30166 || !insn_data[icode].operand[0].predicate (target, tmode))
30167 target = gen_reg_rtx (tmode);
30169 gcc_assert (nargs <= 4);
30171 for (i = 0; i < nargs; i++)
30173 tree arg = CALL_EXPR_ARG (exp, i);
30174 rtx op = expand_normal (arg);
30175 int adjust = (comparison_p) ? 1 : 0;
30176 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30178 if (last_arg_constant && i == nargs - 1)
30180 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30182 enum insn_code new_icode = icode;
30183 switch (icode)
30185 case CODE_FOR_xop_vpermil2v2df3:
30186 case CODE_FOR_xop_vpermil2v4sf3:
30187 case CODE_FOR_xop_vpermil2v4df3:
30188 case CODE_FOR_xop_vpermil2v8sf3:
30189 error ("the last argument must be a 2-bit immediate");
30190 return gen_reg_rtx (tmode);
30191 case CODE_FOR_xop_rotlv2di3:
30192 new_icode = CODE_FOR_rotlv2di3;
30193 goto xop_rotl;
30194 case CODE_FOR_xop_rotlv4si3:
30195 new_icode = CODE_FOR_rotlv4si3;
30196 goto xop_rotl;
30197 case CODE_FOR_xop_rotlv8hi3:
30198 new_icode = CODE_FOR_rotlv8hi3;
30199 goto xop_rotl;
30200 case CODE_FOR_xop_rotlv16qi3:
30201 new_icode = CODE_FOR_rotlv16qi3;
30202 xop_rotl:
30203 if (CONST_INT_P (op))
30205 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30206 op = GEN_INT (INTVAL (op) & mask);
30207 gcc_checking_assert
30208 (insn_data[icode].operand[i + 1].predicate (op, mode));
30210 else
30212 gcc_checking_assert
30213 (nargs == 2
30214 && insn_data[new_icode].operand[0].mode == tmode
30215 && insn_data[new_icode].operand[1].mode == tmode
30216 && insn_data[new_icode].operand[2].mode == mode
30217 && insn_data[new_icode].operand[0].predicate
30218 == insn_data[icode].operand[0].predicate
30219 && insn_data[new_icode].operand[1].predicate
30220 == insn_data[icode].operand[1].predicate);
30221 icode = new_icode;
30222 goto non_constant;
30224 break;
30225 default:
30226 gcc_unreachable ();
30230 else
30232 non_constant:
30233 if (VECTOR_MODE_P (mode))
30234 op = safe_vector_operand (op, mode);
30236 /* If we aren't optimizing, only allow one memory operand to be
30237 generated. */
30238 if (memory_operand (op, mode))
30239 num_memory++;
30241 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30243 if (optimize
30244 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30245 || num_memory > 1)
30246 op = force_reg (mode, op);
30249 args[i].op = op;
30250 args[i].mode = mode;
30253 switch (nargs)
30255 case 1:
30256 pat = GEN_FCN (icode) (target, args[0].op);
30257 break;
30259 case 2:
30260 if (tf_p)
30261 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30262 GEN_INT ((int)sub_code));
30263 else if (! comparison_p)
30264 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30265 else
30267 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30268 args[0].op,
30269 args[1].op);
30271 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30273 break;
30275 case 3:
30276 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30277 break;
30279 case 4:
30280 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30281 break;
30283 default:
30284 gcc_unreachable ();
30287 if (! pat)
30288 return 0;
30290 emit_insn (pat);
30291 return target;
30294 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30295 insns with vec_merge. */
30297 static rtx
30298 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30299 rtx target)
30301 rtx pat;
30302 tree arg0 = CALL_EXPR_ARG (exp, 0);
30303 rtx op1, op0 = expand_normal (arg0);
30304 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30305 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30307 if (optimize || !target
30308 || GET_MODE (target) != tmode
30309 || !insn_data[icode].operand[0].predicate (target, tmode))
30310 target = gen_reg_rtx (tmode);
30312 if (VECTOR_MODE_P (mode0))
30313 op0 = safe_vector_operand (op0, mode0);
30315 if ((optimize && !register_operand (op0, mode0))
30316 || !insn_data[icode].operand[1].predicate (op0, mode0))
30317 op0 = copy_to_mode_reg (mode0, op0);
30319 op1 = op0;
30320 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30321 op1 = copy_to_mode_reg (mode0, op1);
30323 pat = GEN_FCN (icode) (target, op0, op1);
30324 if (! pat)
30325 return 0;
30326 emit_insn (pat);
30327 return target;
30330 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30332 static rtx
30333 ix86_expand_sse_compare (const struct builtin_description *d,
30334 tree exp, rtx target, bool swap)
30336 rtx pat;
30337 tree arg0 = CALL_EXPR_ARG (exp, 0);
30338 tree arg1 = CALL_EXPR_ARG (exp, 1);
30339 rtx op0 = expand_normal (arg0);
30340 rtx op1 = expand_normal (arg1);
30341 rtx op2;
30342 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30343 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30344 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30345 enum rtx_code comparison = d->comparison;
30347 if (VECTOR_MODE_P (mode0))
30348 op0 = safe_vector_operand (op0, mode0);
30349 if (VECTOR_MODE_P (mode1))
30350 op1 = safe_vector_operand (op1, mode1);
30352 /* Swap operands if we have a comparison that isn't available in
30353 hardware. */
30354 if (swap)
30356 rtx tmp = gen_reg_rtx (mode1);
30357 emit_move_insn (tmp, op1);
30358 op1 = op0;
30359 op0 = tmp;
30362 if (optimize || !target
30363 || GET_MODE (target) != tmode
30364 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30365 target = gen_reg_rtx (tmode);
30367 if ((optimize && !register_operand (op0, mode0))
30368 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30369 op0 = copy_to_mode_reg (mode0, op0);
30370 if ((optimize && !register_operand (op1, mode1))
30371 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30372 op1 = copy_to_mode_reg (mode1, op1);
30374 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30375 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30376 if (! pat)
30377 return 0;
30378 emit_insn (pat);
30379 return target;
30382 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30384 static rtx
30385 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30386 rtx target)
30388 rtx pat;
30389 tree arg0 = CALL_EXPR_ARG (exp, 0);
30390 tree arg1 = CALL_EXPR_ARG (exp, 1);
30391 rtx op0 = expand_normal (arg0);
30392 rtx op1 = expand_normal (arg1);
30393 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30394 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30395 enum rtx_code comparison = d->comparison;
30397 if (VECTOR_MODE_P (mode0))
30398 op0 = safe_vector_operand (op0, mode0);
30399 if (VECTOR_MODE_P (mode1))
30400 op1 = safe_vector_operand (op1, mode1);
30402 /* Swap operands if we have a comparison that isn't available in
30403 hardware. */
30404 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30406 rtx tmp = op1;
30407 op1 = op0;
30408 op0 = tmp;
30411 target = gen_reg_rtx (SImode);
30412 emit_move_insn (target, const0_rtx);
30413 target = gen_rtx_SUBREG (QImode, target, 0);
30415 if ((optimize && !register_operand (op0, mode0))
30416 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30417 op0 = copy_to_mode_reg (mode0, op0);
30418 if ((optimize && !register_operand (op1, mode1))
30419 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30420 op1 = copy_to_mode_reg (mode1, op1);
30422 pat = GEN_FCN (d->icode) (op0, op1);
30423 if (! pat)
30424 return 0;
30425 emit_insn (pat);
30426 emit_insn (gen_rtx_SET (VOIDmode,
30427 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30428 gen_rtx_fmt_ee (comparison, QImode,
30429 SET_DEST (pat),
30430 const0_rtx)));
30432 return SUBREG_REG (target);
30435 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30437 static rtx
30438 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30439 rtx target)
30441 rtx pat;
30442 tree arg0 = CALL_EXPR_ARG (exp, 0);
30443 rtx op1, op0 = expand_normal (arg0);
30444 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30445 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30447 if (optimize || target == 0
30448 || GET_MODE (target) != tmode
30449 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30450 target = gen_reg_rtx (tmode);
30452 if (VECTOR_MODE_P (mode0))
30453 op0 = safe_vector_operand (op0, mode0);
30455 if ((optimize && !register_operand (op0, mode0))
30456 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30457 op0 = copy_to_mode_reg (mode0, op0);
30459 op1 = GEN_INT (d->comparison);
30461 pat = GEN_FCN (d->icode) (target, op0, op1);
30462 if (! pat)
30463 return 0;
30464 emit_insn (pat);
30465 return target;
30468 static rtx
30469 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30470 tree exp, rtx target)
30472 rtx pat;
30473 tree arg0 = CALL_EXPR_ARG (exp, 0);
30474 tree arg1 = CALL_EXPR_ARG (exp, 1);
30475 rtx op0 = expand_normal (arg0);
30476 rtx op1 = expand_normal (arg1);
30477 rtx op2;
30478 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30479 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30480 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30482 if (optimize || target == 0
30483 || GET_MODE (target) != tmode
30484 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30485 target = gen_reg_rtx (tmode);
30487 op0 = safe_vector_operand (op0, mode0);
30488 op1 = safe_vector_operand (op1, mode1);
30490 if ((optimize && !register_operand (op0, mode0))
30491 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30492 op0 = copy_to_mode_reg (mode0, op0);
30493 if ((optimize && !register_operand (op1, mode1))
30494 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30495 op1 = copy_to_mode_reg (mode1, op1);
30497 op2 = GEN_INT (d->comparison);
30499 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30500 if (! pat)
30501 return 0;
30502 emit_insn (pat);
30503 return target;
30506 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30508 static rtx
30509 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30510 rtx target)
30512 rtx pat;
30513 tree arg0 = CALL_EXPR_ARG (exp, 0);
30514 tree arg1 = CALL_EXPR_ARG (exp, 1);
30515 rtx op0 = expand_normal (arg0);
30516 rtx op1 = expand_normal (arg1);
30517 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30518 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30519 enum rtx_code comparison = d->comparison;
30521 if (VECTOR_MODE_P (mode0))
30522 op0 = safe_vector_operand (op0, mode0);
30523 if (VECTOR_MODE_P (mode1))
30524 op1 = safe_vector_operand (op1, mode1);
30526 target = gen_reg_rtx (SImode);
30527 emit_move_insn (target, const0_rtx);
30528 target = gen_rtx_SUBREG (QImode, target, 0);
30530 if ((optimize && !register_operand (op0, mode0))
30531 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30532 op0 = copy_to_mode_reg (mode0, op0);
30533 if ((optimize && !register_operand (op1, mode1))
30534 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30535 op1 = copy_to_mode_reg (mode1, op1);
30537 pat = GEN_FCN (d->icode) (op0, op1);
30538 if (! pat)
30539 return 0;
30540 emit_insn (pat);
30541 emit_insn (gen_rtx_SET (VOIDmode,
30542 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30543 gen_rtx_fmt_ee (comparison, QImode,
30544 SET_DEST (pat),
30545 const0_rtx)));
30547 return SUBREG_REG (target);
30550 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30552 static rtx
30553 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30554 tree exp, rtx target)
30556 rtx pat;
30557 tree arg0 = CALL_EXPR_ARG (exp, 0);
30558 tree arg1 = CALL_EXPR_ARG (exp, 1);
30559 tree arg2 = CALL_EXPR_ARG (exp, 2);
30560 tree arg3 = CALL_EXPR_ARG (exp, 3);
30561 tree arg4 = CALL_EXPR_ARG (exp, 4);
30562 rtx scratch0, scratch1;
30563 rtx op0 = expand_normal (arg0);
30564 rtx op1 = expand_normal (arg1);
30565 rtx op2 = expand_normal (arg2);
30566 rtx op3 = expand_normal (arg3);
30567 rtx op4 = expand_normal (arg4);
30568 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30570 tmode0 = insn_data[d->icode].operand[0].mode;
30571 tmode1 = insn_data[d->icode].operand[1].mode;
30572 modev2 = insn_data[d->icode].operand[2].mode;
30573 modei3 = insn_data[d->icode].operand[3].mode;
30574 modev4 = insn_data[d->icode].operand[4].mode;
30575 modei5 = insn_data[d->icode].operand[5].mode;
30576 modeimm = insn_data[d->icode].operand[6].mode;
30578 if (VECTOR_MODE_P (modev2))
30579 op0 = safe_vector_operand (op0, modev2);
30580 if (VECTOR_MODE_P (modev4))
30581 op2 = safe_vector_operand (op2, modev4);
30583 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30584 op0 = copy_to_mode_reg (modev2, op0);
30585 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30586 op1 = copy_to_mode_reg (modei3, op1);
30587 if ((optimize && !register_operand (op2, modev4))
30588 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30589 op2 = copy_to_mode_reg (modev4, op2);
30590 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30591 op3 = copy_to_mode_reg (modei5, op3);
30593 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30595 error ("the fifth argument must be an 8-bit immediate");
30596 return const0_rtx;
30599 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30601 if (optimize || !target
30602 || GET_MODE (target) != tmode0
30603 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30604 target = gen_reg_rtx (tmode0);
30606 scratch1 = gen_reg_rtx (tmode1);
30608 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30610 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30612 if (optimize || !target
30613 || GET_MODE (target) != tmode1
30614 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30615 target = gen_reg_rtx (tmode1);
30617 scratch0 = gen_reg_rtx (tmode0);
30619 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30621 else
30623 gcc_assert (d->flag);
30625 scratch0 = gen_reg_rtx (tmode0);
30626 scratch1 = gen_reg_rtx (tmode1);
30628 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30631 if (! pat)
30632 return 0;
30634 emit_insn (pat);
30636 if (d->flag)
30638 target = gen_reg_rtx (SImode);
30639 emit_move_insn (target, const0_rtx);
30640 target = gen_rtx_SUBREG (QImode, target, 0);
30642 emit_insn
30643 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30644 gen_rtx_fmt_ee (EQ, QImode,
30645 gen_rtx_REG ((enum machine_mode) d->flag,
30646 FLAGS_REG),
30647 const0_rtx)));
30648 return SUBREG_REG (target);
30650 else
30651 return target;
30655 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30657 static rtx
30658 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30659 tree exp, rtx target)
30661 rtx pat;
30662 tree arg0 = CALL_EXPR_ARG (exp, 0);
30663 tree arg1 = CALL_EXPR_ARG (exp, 1);
30664 tree arg2 = CALL_EXPR_ARG (exp, 2);
30665 rtx scratch0, scratch1;
30666 rtx op0 = expand_normal (arg0);
30667 rtx op1 = expand_normal (arg1);
30668 rtx op2 = expand_normal (arg2);
30669 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30671 tmode0 = insn_data[d->icode].operand[0].mode;
30672 tmode1 = insn_data[d->icode].operand[1].mode;
30673 modev2 = insn_data[d->icode].operand[2].mode;
30674 modev3 = insn_data[d->icode].operand[3].mode;
30675 modeimm = insn_data[d->icode].operand[4].mode;
30677 if (VECTOR_MODE_P (modev2))
30678 op0 = safe_vector_operand (op0, modev2);
30679 if (VECTOR_MODE_P (modev3))
30680 op1 = safe_vector_operand (op1, modev3);
30682 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30683 op0 = copy_to_mode_reg (modev2, op0);
30684 if ((optimize && !register_operand (op1, modev3))
30685 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30686 op1 = copy_to_mode_reg (modev3, op1);
30688 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30690 error ("the third argument must be an 8-bit immediate");
30691 return const0_rtx;
30694 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30696 if (optimize || !target
30697 || GET_MODE (target) != tmode0
30698 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30699 target = gen_reg_rtx (tmode0);
30701 scratch1 = gen_reg_rtx (tmode1);
30703 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30705 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30707 if (optimize || !target
30708 || GET_MODE (target) != tmode1
30709 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30710 target = gen_reg_rtx (tmode1);
30712 scratch0 = gen_reg_rtx (tmode0);
30714 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30716 else
30718 gcc_assert (d->flag);
30720 scratch0 = gen_reg_rtx (tmode0);
30721 scratch1 = gen_reg_rtx (tmode1);
30723 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30726 if (! pat)
30727 return 0;
30729 emit_insn (pat);
30731 if (d->flag)
30733 target = gen_reg_rtx (SImode);
30734 emit_move_insn (target, const0_rtx);
30735 target = gen_rtx_SUBREG (QImode, target, 0);
30737 emit_insn
30738 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30739 gen_rtx_fmt_ee (EQ, QImode,
30740 gen_rtx_REG ((enum machine_mode) d->flag,
30741 FLAGS_REG),
30742 const0_rtx)));
30743 return SUBREG_REG (target);
30745 else
30746 return target;
30749 /* Subroutine of ix86_expand_builtin to take care of insns with
30750 a variable number of operands. */
30752 static rtx
30753 ix86_expand_args_builtin (const struct builtin_description *d,
30754 tree exp, rtx target)
30756 rtx pat, real_target;
30757 unsigned int i, nargs;
30758 unsigned int nargs_constant = 0;
30759 int num_memory = 0;
30760 struct
30762 rtx op;
30763 enum machine_mode mode;
30764 } args[4];
30765 bool last_arg_count = false;
30766 enum insn_code icode = d->icode;
30767 const struct insn_data_d *insn_p = &insn_data[icode];
30768 enum machine_mode tmode = insn_p->operand[0].mode;
30769 enum machine_mode rmode = VOIDmode;
30770 bool swap = false;
30771 enum rtx_code comparison = d->comparison;
30773 switch ((enum ix86_builtin_func_type) d->flag)
30775 case V2DF_FTYPE_V2DF_ROUND:
30776 case V4DF_FTYPE_V4DF_ROUND:
30777 case V4SF_FTYPE_V4SF_ROUND:
30778 case V8SF_FTYPE_V8SF_ROUND:
30779 case V4SI_FTYPE_V4SF_ROUND:
30780 case V8SI_FTYPE_V8SF_ROUND:
30781 return ix86_expand_sse_round (d, exp, target);
30782 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30783 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30784 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30785 case INT_FTYPE_V8SF_V8SF_PTEST:
30786 case INT_FTYPE_V4DI_V4DI_PTEST:
30787 case INT_FTYPE_V4DF_V4DF_PTEST:
30788 case INT_FTYPE_V4SF_V4SF_PTEST:
30789 case INT_FTYPE_V2DI_V2DI_PTEST:
30790 case INT_FTYPE_V2DF_V2DF_PTEST:
30791 return ix86_expand_sse_ptest (d, exp, target);
30792 case FLOAT128_FTYPE_FLOAT128:
30793 case FLOAT_FTYPE_FLOAT:
30794 case INT_FTYPE_INT:
30795 case UINT64_FTYPE_INT:
30796 case UINT16_FTYPE_UINT16:
30797 case INT64_FTYPE_INT64:
30798 case INT64_FTYPE_V4SF:
30799 case INT64_FTYPE_V2DF:
30800 case INT_FTYPE_V16QI:
30801 case INT_FTYPE_V8QI:
30802 case INT_FTYPE_V8SF:
30803 case INT_FTYPE_V4DF:
30804 case INT_FTYPE_V4SF:
30805 case INT_FTYPE_V2DF:
30806 case INT_FTYPE_V32QI:
30807 case V16QI_FTYPE_V16QI:
30808 case V8SI_FTYPE_V8SF:
30809 case V8SI_FTYPE_V4SI:
30810 case V8HI_FTYPE_V8HI:
30811 case V8HI_FTYPE_V16QI:
30812 case V8QI_FTYPE_V8QI:
30813 case V8SF_FTYPE_V8SF:
30814 case V8SF_FTYPE_V8SI:
30815 case V8SF_FTYPE_V4SF:
30816 case V8SF_FTYPE_V8HI:
30817 case V4SI_FTYPE_V4SI:
30818 case V4SI_FTYPE_V16QI:
30819 case V4SI_FTYPE_V4SF:
30820 case V4SI_FTYPE_V8SI:
30821 case V4SI_FTYPE_V8HI:
30822 case V4SI_FTYPE_V4DF:
30823 case V4SI_FTYPE_V2DF:
30824 case V4HI_FTYPE_V4HI:
30825 case V4DF_FTYPE_V4DF:
30826 case V4DF_FTYPE_V4SI:
30827 case V4DF_FTYPE_V4SF:
30828 case V4DF_FTYPE_V2DF:
30829 case V4SF_FTYPE_V4SF:
30830 case V4SF_FTYPE_V4SI:
30831 case V4SF_FTYPE_V8SF:
30832 case V4SF_FTYPE_V4DF:
30833 case V4SF_FTYPE_V8HI:
30834 case V4SF_FTYPE_V2DF:
30835 case V2DI_FTYPE_V2DI:
30836 case V2DI_FTYPE_V16QI:
30837 case V2DI_FTYPE_V8HI:
30838 case V2DI_FTYPE_V4SI:
30839 case V2DF_FTYPE_V2DF:
30840 case V2DF_FTYPE_V4SI:
30841 case V2DF_FTYPE_V4DF:
30842 case V2DF_FTYPE_V4SF:
30843 case V2DF_FTYPE_V2SI:
30844 case V2SI_FTYPE_V2SI:
30845 case V2SI_FTYPE_V4SF:
30846 case V2SI_FTYPE_V2SF:
30847 case V2SI_FTYPE_V2DF:
30848 case V2SF_FTYPE_V2SF:
30849 case V2SF_FTYPE_V2SI:
30850 case V32QI_FTYPE_V32QI:
30851 case V32QI_FTYPE_V16QI:
30852 case V16HI_FTYPE_V16HI:
30853 case V16HI_FTYPE_V8HI:
30854 case V8SI_FTYPE_V8SI:
30855 case V16HI_FTYPE_V16QI:
30856 case V8SI_FTYPE_V16QI:
30857 case V4DI_FTYPE_V16QI:
30858 case V8SI_FTYPE_V8HI:
30859 case V4DI_FTYPE_V8HI:
30860 case V4DI_FTYPE_V4SI:
30861 case V4DI_FTYPE_V2DI:
30862 nargs = 1;
30863 break;
30864 case V4SF_FTYPE_V4SF_VEC_MERGE:
30865 case V2DF_FTYPE_V2DF_VEC_MERGE:
30866 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30867 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30868 case V16QI_FTYPE_V16QI_V16QI:
30869 case V16QI_FTYPE_V8HI_V8HI:
30870 case V8QI_FTYPE_V8QI_V8QI:
30871 case V8QI_FTYPE_V4HI_V4HI:
30872 case V8HI_FTYPE_V8HI_V8HI:
30873 case V8HI_FTYPE_V16QI_V16QI:
30874 case V8HI_FTYPE_V4SI_V4SI:
30875 case V8SF_FTYPE_V8SF_V8SF:
30876 case V8SF_FTYPE_V8SF_V8SI:
30877 case V4SI_FTYPE_V4SI_V4SI:
30878 case V4SI_FTYPE_V8HI_V8HI:
30879 case V4SI_FTYPE_V4SF_V4SF:
30880 case V4SI_FTYPE_V2DF_V2DF:
30881 case V4HI_FTYPE_V4HI_V4HI:
30882 case V4HI_FTYPE_V8QI_V8QI:
30883 case V4HI_FTYPE_V2SI_V2SI:
30884 case V4DF_FTYPE_V4DF_V4DF:
30885 case V4DF_FTYPE_V4DF_V4DI:
30886 case V4SF_FTYPE_V4SF_V4SF:
30887 case V4SF_FTYPE_V4SF_V4SI:
30888 case V4SF_FTYPE_V4SF_V2SI:
30889 case V4SF_FTYPE_V4SF_V2DF:
30890 case V4SF_FTYPE_V4SF_DI:
30891 case V4SF_FTYPE_V4SF_SI:
30892 case V2DI_FTYPE_V2DI_V2DI:
30893 case V2DI_FTYPE_V16QI_V16QI:
30894 case V2DI_FTYPE_V4SI_V4SI:
30895 case V2UDI_FTYPE_V4USI_V4USI:
30896 case V2DI_FTYPE_V2DI_V16QI:
30897 case V2DI_FTYPE_V2DF_V2DF:
30898 case V2SI_FTYPE_V2SI_V2SI:
30899 case V2SI_FTYPE_V4HI_V4HI:
30900 case V2SI_FTYPE_V2SF_V2SF:
30901 case V2DF_FTYPE_V2DF_V2DF:
30902 case V2DF_FTYPE_V2DF_V4SF:
30903 case V2DF_FTYPE_V2DF_V2DI:
30904 case V2DF_FTYPE_V2DF_DI:
30905 case V2DF_FTYPE_V2DF_SI:
30906 case V2SF_FTYPE_V2SF_V2SF:
30907 case V1DI_FTYPE_V1DI_V1DI:
30908 case V1DI_FTYPE_V8QI_V8QI:
30909 case V1DI_FTYPE_V2SI_V2SI:
30910 case V32QI_FTYPE_V16HI_V16HI:
30911 case V16HI_FTYPE_V8SI_V8SI:
30912 case V32QI_FTYPE_V32QI_V32QI:
30913 case V16HI_FTYPE_V32QI_V32QI:
30914 case V16HI_FTYPE_V16HI_V16HI:
30915 case V8SI_FTYPE_V4DF_V4DF:
30916 case V8SI_FTYPE_V8SI_V8SI:
30917 case V8SI_FTYPE_V16HI_V16HI:
30918 case V4DI_FTYPE_V4DI_V4DI:
30919 case V4DI_FTYPE_V8SI_V8SI:
30920 case V4UDI_FTYPE_V8USI_V8USI:
30921 if (comparison == UNKNOWN)
30922 return ix86_expand_binop_builtin (icode, exp, target);
30923 nargs = 2;
30924 break;
30925 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30926 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30927 gcc_assert (comparison != UNKNOWN);
30928 nargs = 2;
30929 swap = true;
30930 break;
30931 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30932 case V16HI_FTYPE_V16HI_SI_COUNT:
30933 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30934 case V8SI_FTYPE_V8SI_SI_COUNT:
30935 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30936 case V4DI_FTYPE_V4DI_INT_COUNT:
30937 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30938 case V8HI_FTYPE_V8HI_SI_COUNT:
30939 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30940 case V4SI_FTYPE_V4SI_SI_COUNT:
30941 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30942 case V4HI_FTYPE_V4HI_SI_COUNT:
30943 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30944 case V2DI_FTYPE_V2DI_SI_COUNT:
30945 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30946 case V2SI_FTYPE_V2SI_SI_COUNT:
30947 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30948 case V1DI_FTYPE_V1DI_SI_COUNT:
30949 nargs = 2;
30950 last_arg_count = true;
30951 break;
30952 case UINT64_FTYPE_UINT64_UINT64:
30953 case UINT_FTYPE_UINT_UINT:
30954 case UINT_FTYPE_UINT_USHORT:
30955 case UINT_FTYPE_UINT_UCHAR:
30956 case UINT16_FTYPE_UINT16_INT:
30957 case UINT8_FTYPE_UINT8_INT:
30958 nargs = 2;
30959 break;
30960 case V2DI_FTYPE_V2DI_INT_CONVERT:
30961 nargs = 2;
30962 rmode = V1TImode;
30963 nargs_constant = 1;
30964 break;
30965 case V4DI_FTYPE_V4DI_INT_CONVERT:
30966 nargs = 2;
30967 rmode = V2TImode;
30968 nargs_constant = 1;
30969 break;
30970 case V8HI_FTYPE_V8HI_INT:
30971 case V8HI_FTYPE_V8SF_INT:
30972 case V8HI_FTYPE_V4SF_INT:
30973 case V8SF_FTYPE_V8SF_INT:
30974 case V4SI_FTYPE_V4SI_INT:
30975 case V4SI_FTYPE_V8SI_INT:
30976 case V4HI_FTYPE_V4HI_INT:
30977 case V4DF_FTYPE_V4DF_INT:
30978 case V4SF_FTYPE_V4SF_INT:
30979 case V4SF_FTYPE_V8SF_INT:
30980 case V2DI_FTYPE_V2DI_INT:
30981 case V2DF_FTYPE_V2DF_INT:
30982 case V2DF_FTYPE_V4DF_INT:
30983 case V16HI_FTYPE_V16HI_INT:
30984 case V8SI_FTYPE_V8SI_INT:
30985 case V4DI_FTYPE_V4DI_INT:
30986 case V2DI_FTYPE_V4DI_INT:
30987 nargs = 2;
30988 nargs_constant = 1;
30989 break;
30990 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30991 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30992 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30993 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30994 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30995 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30996 nargs = 3;
30997 break;
30998 case V32QI_FTYPE_V32QI_V32QI_INT:
30999 case V16HI_FTYPE_V16HI_V16HI_INT:
31000 case V16QI_FTYPE_V16QI_V16QI_INT:
31001 case V4DI_FTYPE_V4DI_V4DI_INT:
31002 case V8HI_FTYPE_V8HI_V8HI_INT:
31003 case V8SI_FTYPE_V8SI_V8SI_INT:
31004 case V8SI_FTYPE_V8SI_V4SI_INT:
31005 case V8SF_FTYPE_V8SF_V8SF_INT:
31006 case V8SF_FTYPE_V8SF_V4SF_INT:
31007 case V4SI_FTYPE_V4SI_V4SI_INT:
31008 case V4DF_FTYPE_V4DF_V4DF_INT:
31009 case V4DF_FTYPE_V4DF_V2DF_INT:
31010 case V4SF_FTYPE_V4SF_V4SF_INT:
31011 case V2DI_FTYPE_V2DI_V2DI_INT:
31012 case V4DI_FTYPE_V4DI_V2DI_INT:
31013 case V2DF_FTYPE_V2DF_V2DF_INT:
31014 nargs = 3;
31015 nargs_constant = 1;
31016 break;
31017 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31018 nargs = 3;
31019 rmode = V4DImode;
31020 nargs_constant = 1;
31021 break;
31022 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31023 nargs = 3;
31024 rmode = V2DImode;
31025 nargs_constant = 1;
31026 break;
31027 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31028 nargs = 3;
31029 rmode = DImode;
31030 nargs_constant = 1;
31031 break;
31032 case V2DI_FTYPE_V2DI_UINT_UINT:
31033 nargs = 3;
31034 nargs_constant = 2;
31035 break;
31036 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31037 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31038 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31039 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31040 nargs = 4;
31041 nargs_constant = 1;
31042 break;
31043 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31044 nargs = 4;
31045 nargs_constant = 2;
31046 break;
31047 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31048 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31049 nargs = 4;
31050 break;
31051 default:
31052 gcc_unreachable ();
31055 gcc_assert (nargs <= ARRAY_SIZE (args));
31057 if (comparison != UNKNOWN)
31059 gcc_assert (nargs == 2);
31060 return ix86_expand_sse_compare (d, exp, target, swap);
31063 if (rmode == VOIDmode || rmode == tmode)
31065 if (optimize
31066 || target == 0
31067 || GET_MODE (target) != tmode
31068 || !insn_p->operand[0].predicate (target, tmode))
31069 target = gen_reg_rtx (tmode);
31070 real_target = target;
31072 else
31074 target = gen_reg_rtx (rmode);
31075 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31078 for (i = 0; i < nargs; i++)
31080 tree arg = CALL_EXPR_ARG (exp, i);
31081 rtx op = expand_normal (arg);
31082 enum machine_mode mode = insn_p->operand[i + 1].mode;
31083 bool match = insn_p->operand[i + 1].predicate (op, mode);
31085 if (last_arg_count && (i + 1) == nargs)
31087 /* SIMD shift insns take either an 8-bit immediate or a
31088 register as the count. But the builtin functions take an int as
31089 the count. If the count doesn't match, we put it in a register. */
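/* Illustrative sketch (user code, assuming the emmintrin.h wrappers of
   this compiler):

     __m128i shift_each (__m128i v, int n)
     {
       return _mm_slli_epi32 (v, n);
     }

   When N is not a suitable immediate, the code below copies the count
   into a register so that a variable-count shift can be emitted.  */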
31090 if (!match)
31092 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31093 if (!insn_p->operand[i + 1].predicate (op, mode))
31094 op = copy_to_reg (op);
31097 else if ((nargs - i) <= nargs_constant)
31099 if (!match)
31100 switch (icode)
31102 case CODE_FOR_avx2_inserti128:
31103 case CODE_FOR_avx2_extracti128:
31104 error ("the last argument must be a 1-bit immediate");
31105 return const0_rtx;
31107 case CODE_FOR_sse4_1_roundsd:
31108 case CODE_FOR_sse4_1_roundss:
31110 case CODE_FOR_sse4_1_roundpd:
31111 case CODE_FOR_sse4_1_roundps:
31112 case CODE_FOR_avx_roundpd256:
31113 case CODE_FOR_avx_roundps256:
31115 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31116 case CODE_FOR_sse4_1_roundps_sfix:
31117 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31118 case CODE_FOR_avx_roundps_sfix256:
31120 case CODE_FOR_sse4_1_blendps:
31121 case CODE_FOR_avx_blendpd256:
31122 case CODE_FOR_avx_vpermilv4df:
31123 error ("the last argument must be a 4-bit immediate");
31124 return const0_rtx;
31126 case CODE_FOR_sse4_1_blendpd:
31127 case CODE_FOR_avx_vpermilv2df:
31128 case CODE_FOR_xop_vpermil2v2df3:
31129 case CODE_FOR_xop_vpermil2v4sf3:
31130 case CODE_FOR_xop_vpermil2v4df3:
31131 case CODE_FOR_xop_vpermil2v8sf3:
31132 error ("the last argument must be a 2-bit immediate");
31133 return const0_rtx;
31135 case CODE_FOR_avx_vextractf128v4df:
31136 case CODE_FOR_avx_vextractf128v8sf:
31137 case CODE_FOR_avx_vextractf128v8si:
31138 case CODE_FOR_avx_vinsertf128v4df:
31139 case CODE_FOR_avx_vinsertf128v8sf:
31140 case CODE_FOR_avx_vinsertf128v8si:
31141 error ("the last argument must be a 1-bit immediate");
31142 return const0_rtx;
31144 case CODE_FOR_avx_vmcmpv2df3:
31145 case CODE_FOR_avx_vmcmpv4sf3:
31146 case CODE_FOR_avx_cmpv2df3:
31147 case CODE_FOR_avx_cmpv4sf3:
31148 case CODE_FOR_avx_cmpv4df3:
31149 case CODE_FOR_avx_cmpv8sf3:
31150 error ("the last argument must be a 5-bit immediate");
31151 return const0_rtx;
31153 default:
31154 switch (nargs_constant)
31156 case 2:
31157 if ((nargs - i) == nargs_constant)
31159 error ("the next to last argument must be an 8-bit immediate");
31160 break;
31162 case 1:
31163 error ("the last argument must be an 8-bit immediate");
31164 break;
31165 default:
31166 gcc_unreachable ();
31168 return const0_rtx;
31171 else
31173 if (VECTOR_MODE_P (mode))
31174 op = safe_vector_operand (op, mode);
31176 /* If we aren't optimizing, only allow one memory operand to
31177 be generated. */
31178 if (memory_operand (op, mode))
31179 num_memory++;
31181 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31183 if (optimize || !match || num_memory > 1)
31184 op = copy_to_mode_reg (mode, op);
31186 else
31188 op = copy_to_reg (op);
31189 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31193 args[i].op = op;
31194 args[i].mode = mode;
31197 switch (nargs)
31199 case 1:
31200 pat = GEN_FCN (icode) (real_target, args[0].op);
31201 break;
31202 case 2:
31203 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31204 break;
31205 case 3:
31206 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31207 args[2].op);
31208 break;
31209 case 4:
31210 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31211 args[2].op, args[3].op);
31212 break;
31213 default:
31214 gcc_unreachable ();
31217 if (! pat)
31218 return 0;
31220 emit_insn (pat);
31221 return target;
31224 /* Subroutine of ix86_expand_builtin to take care of special insns
31225 with a variable number of operands. */
31227 static rtx
31228 ix86_expand_special_args_builtin (const struct builtin_description *d,
31229 tree exp, rtx target)
31231 tree arg;
31232 rtx pat, op;
31233 unsigned int i, nargs, arg_adjust, memory;
31234 struct
31236 rtx op;
31237 enum machine_mode mode;
31238 } args[3];
31239 enum insn_code icode = d->icode;
31240 bool last_arg_constant = false;
31241 const struct insn_data_d *insn_p = &insn_data[icode];
31242 enum machine_mode tmode = insn_p->operand[0].mode;
31243 enum { load, store } klass;
31245 switch ((enum ix86_builtin_func_type) d->flag)
31247 case VOID_FTYPE_VOID:
31248 emit_insn (GEN_FCN (icode) (target));
31249 return 0;
31250 case VOID_FTYPE_UINT64:
31251 case VOID_FTYPE_UNSIGNED:
31252 nargs = 0;
31253 klass = store;
31254 memory = 0;
31255 break;
31257 case INT_FTYPE_VOID:
31258 case UINT64_FTYPE_VOID:
31259 case UNSIGNED_FTYPE_VOID:
31260 nargs = 0;
31261 klass = load;
31262 memory = 0;
31263 break;
31264 case UINT64_FTYPE_PUNSIGNED:
31265 case V2DI_FTYPE_PV2DI:
31266 case V4DI_FTYPE_PV4DI:
31267 case V32QI_FTYPE_PCCHAR:
31268 case V16QI_FTYPE_PCCHAR:
31269 case V8SF_FTYPE_PCV4SF:
31270 case V8SF_FTYPE_PCFLOAT:
31271 case V4SF_FTYPE_PCFLOAT:
31272 case V4DF_FTYPE_PCV2DF:
31273 case V4DF_FTYPE_PCDOUBLE:
31274 case V2DF_FTYPE_PCDOUBLE:
31275 case VOID_FTYPE_PVOID:
31276 nargs = 1;
31277 klass = load;
31278 memory = 0;
31279 break;
31280 case VOID_FTYPE_PV2SF_V4SF:
31281 case VOID_FTYPE_PV4DI_V4DI:
31282 case VOID_FTYPE_PV2DI_V2DI:
31283 case VOID_FTYPE_PCHAR_V32QI:
31284 case VOID_FTYPE_PCHAR_V16QI:
31285 case VOID_FTYPE_PFLOAT_V8SF:
31286 case VOID_FTYPE_PFLOAT_V4SF:
31287 case VOID_FTYPE_PDOUBLE_V4DF:
31288 case VOID_FTYPE_PDOUBLE_V2DF:
31289 case VOID_FTYPE_PLONGLONG_LONGLONG:
31290 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31291 case VOID_FTYPE_PINT_INT:
31292 nargs = 1;
31293 klass = store;
31294 /* Reserve memory operand for target. */
31295 memory = ARRAY_SIZE (args);
31296 break;
31297 case V4SF_FTYPE_V4SF_PCV2SF:
31298 case V2DF_FTYPE_V2DF_PCDOUBLE:
31299 nargs = 2;
31300 klass = load;
31301 memory = 1;
31302 break;
31303 case V8SF_FTYPE_PCV8SF_V8SI:
31304 case V4DF_FTYPE_PCV4DF_V4DI:
31305 case V4SF_FTYPE_PCV4SF_V4SI:
31306 case V2DF_FTYPE_PCV2DF_V2DI:
31307 case V8SI_FTYPE_PCV8SI_V8SI:
31308 case V4DI_FTYPE_PCV4DI_V4DI:
31309 case V4SI_FTYPE_PCV4SI_V4SI:
31310 case V2DI_FTYPE_PCV2DI_V2DI:
31311 nargs = 2;
31312 klass = load;
31313 memory = 0;
31314 break;
31315 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31316 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31317 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31318 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31319 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31320 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31321 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31322 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31323 nargs = 2;
31324 klass = store;
31325 /* Reserve memory operand for target. */
31326 memory = ARRAY_SIZE (args);
31327 break;
31328 case VOID_FTYPE_UINT_UINT_UINT:
31329 case VOID_FTYPE_UINT64_UINT_UINT:
31330 case UCHAR_FTYPE_UINT_UINT_UINT:
31331 case UCHAR_FTYPE_UINT64_UINT_UINT:
31332 nargs = 3;
31333 klass = load;
31334 memory = ARRAY_SIZE (args);
31335 last_arg_constant = true;
31336 break;
31337 default:
31338 gcc_unreachable ();
31341 gcc_assert (nargs <= ARRAY_SIZE (args));
31343 if (klass == store)
31345 arg = CALL_EXPR_ARG (exp, 0);
31346 op = expand_normal (arg);
31347 gcc_assert (target == 0);
31348 if (memory)
31350 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31351 target = gen_rtx_MEM (tmode, op);
31353 else
31354 target = force_reg (tmode, op);
31355 arg_adjust = 1;
31357 else
31359 arg_adjust = 0;
31360 if (optimize
31361 || target == 0
31362 || !register_operand (target, tmode)
31363 || GET_MODE (target) != tmode)
31364 target = gen_reg_rtx (tmode);
31367 for (i = 0; i < nargs; i++)
31369 enum machine_mode mode = insn_p->operand[i + 1].mode;
31370 bool match;
31372 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31373 op = expand_normal (arg);
31374 match = insn_p->operand[i + 1].predicate (op, mode);
31376 if (last_arg_constant && (i + 1) == nargs)
31378 if (!match)
31380 if (icode == CODE_FOR_lwp_lwpvalsi3
31381 || icode == CODE_FOR_lwp_lwpinssi3
31382 || icode == CODE_FOR_lwp_lwpvaldi3
31383 || icode == CODE_FOR_lwp_lwpinsdi3)
31384 error ("the last argument must be a 32-bit immediate");
31385 else
31386 error ("the last argument must be an 8-bit immediate");
31387 return const0_rtx;
31390 else
31392 if (i == memory)
31394 /* This must be the memory operand. */
31395 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31396 op = gen_rtx_MEM (mode, op);
31397 gcc_assert (GET_MODE (op) == mode
31398 || GET_MODE (op) == VOIDmode);
31400 else
31402 /* This must be a register. */
31403 if (VECTOR_MODE_P (mode))
31404 op = safe_vector_operand (op, mode);
31406 gcc_assert (GET_MODE (op) == mode
31407 || GET_MODE (op) == VOIDmode);
31408 op = copy_to_mode_reg (mode, op);
31412 args[i].op = op;
31413 args[i].mode = mode;
31416 switch (nargs)
31418 case 0:
31419 pat = GEN_FCN (icode) (target);
31420 break;
31421 case 1:
31422 pat = GEN_FCN (icode) (target, args[0].op);
31423 break;
31424 case 2:
31425 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31426 break;
31427 case 3:
31428 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31429 break;
31430 default:
31431 gcc_unreachable ();
31434 if (! pat)
31435 return 0;
31436 emit_insn (pat);
31437 return klass == store ? 0 : target;
31440 /* Return the integer constant in ARG. Constrain it to be in the range
31441 of the subparts of VEC_TYPE; issue an error if not. */
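/* For example, for a V4SF vector type TYPE_VECTOR_SUBPARTS is 4, so the
   only valid selectors are 0..3; anything outside that range is
   diagnosed and 0 is used instead.  */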
31443 static int
31444 get_element_number (tree vec_type, tree arg)
31446 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31448 if (!host_integerp (arg, 1)
31449 || (elt = tree_low_cst (arg, 1), elt > max))
31451 error ("selector must be an integer constant in the range 0..%wi", max);
31452 return 0;
31455 return elt;
31458 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31459 ix86_expand_vector_init. We DO have language-level syntax for this, in
31460 the form of (type){ init-list }. Except that since we can't place emms
31461 instructions from inside the compiler, we can't allow the use of MMX
31462 registers unless the user explicitly asks for it. So we do *not* define
31463 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31464 we have builtins invoked by mmintrin.h that give us license to emit
31465 these sorts of instructions. */
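/* Illustrative sketch (user code, not compiler code; assumes the
   mmintrin.h bundled with this compiler, where _mm_set_pi32 is built on
   __builtin_ia32_vec_init_v2si):

     #include <mmintrin.h>

     __m64 pack_pair (int hi, int lo)
     {
       return _mm_set_pi32 (hi, lo);
     }

   Compiled with -mmmx, the wrapper reaches IX86_BUILTIN_VEC_INIT_V2SI and
   is expanded by the function below.  */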
31467 static rtx
31468 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31470 enum machine_mode tmode = TYPE_MODE (type);
31471 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31472 int i, n_elt = GET_MODE_NUNITS (tmode);
31473 rtvec v = rtvec_alloc (n_elt);
31475 gcc_assert (VECTOR_MODE_P (tmode));
31476 gcc_assert (call_expr_nargs (exp) == n_elt);
31478 for (i = 0; i < n_elt; ++i)
31480 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31481 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31484 if (!target || !register_operand (target, tmode))
31485 target = gen_reg_rtx (tmode);
31487 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31488 return target;
31491 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31492 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31493 had a language-level syntax for referencing vector elements. */
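/* Illustrative sketch (assumes the emmintrin.h wrapper _mm_extract_epi16,
   which is built on __builtin_ia32_vec_ext_v8hi):

     #include <emmintrin.h>

     int extract_elt_3 (__m128i v)
     {
       return _mm_extract_epi16 (v, 3);
     }

   The call reaches IX86_BUILTIN_VEC_EXT_V8HI and lands in the expander
   below with elt == 3.  */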
31495 static rtx
31496 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31498 enum machine_mode tmode, mode0;
31499 tree arg0, arg1;
31500 int elt;
31501 rtx op0;
31503 arg0 = CALL_EXPR_ARG (exp, 0);
31504 arg1 = CALL_EXPR_ARG (exp, 1);
31506 op0 = expand_normal (arg0);
31507 elt = get_element_number (TREE_TYPE (arg0), arg1);
31509 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31510 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31511 gcc_assert (VECTOR_MODE_P (mode0));
31513 op0 = force_reg (mode0, op0);
31515 if (optimize || !target || !register_operand (target, tmode))
31516 target = gen_reg_rtx (tmode);
31518 ix86_expand_vector_extract (true, target, op0, elt);
31520 return target;
31523 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31524 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31525 a language-level syntax for referencing vector elements. */
31527 static rtx
31528 ix86_expand_vec_set_builtin (tree exp)
31530 enum machine_mode tmode, mode1;
31531 tree arg0, arg1, arg2;
31532 int elt;
31533 rtx op0, op1, target;
31535 arg0 = CALL_EXPR_ARG (exp, 0);
31536 arg1 = CALL_EXPR_ARG (exp, 1);
31537 arg2 = CALL_EXPR_ARG (exp, 2);
31539 tmode = TYPE_MODE (TREE_TYPE (arg0));
31540 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31541 gcc_assert (VECTOR_MODE_P (tmode));
31543 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31544 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31545 elt = get_element_number (TREE_TYPE (arg0), arg2);
31547 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31548 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31550 op0 = force_reg (tmode, op0);
31551 op1 = force_reg (mode1, op1);
31553 /* OP0 is the source of these builtin functions and shouldn't be
31554 modified. Create a copy, use it and return it as target. */
31555 target = gen_reg_rtx (tmode);
31556 emit_move_insn (target, op0);
31557 ix86_expand_vector_set (true, target, op1, elt);
31559 return target;
31562 /* Expand an expression EXP that calls a built-in function,
31563 with result going to TARGET if that's convenient
31564 (and in mode MODE if that's convenient).
31565 SUBTARGET may be used as the target for computing one of EXP's operands.
31566 IGNORE is nonzero if the value is to be ignored. */
31568 static rtx
31569 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31570 enum machine_mode mode ATTRIBUTE_UNUSED,
31571 int ignore ATTRIBUTE_UNUSED)
31573 const struct builtin_description *d;
31574 size_t i;
31575 enum insn_code icode;
31576 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31577 tree arg0, arg1, arg2, arg3, arg4;
31578 rtx op0, op1, op2, op3, op4, pat, insn;
31579 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31580 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31582 /* For CPU builtins that can be folded, fold first and expand the fold. */
31583 switch (fcode)
31585 case IX86_BUILTIN_CPU_INIT:
31587 /* Make it call __cpu_indicator_init in libgcc. */
31588 tree call_expr, fndecl, type;
31589 type = build_function_type_list (integer_type_node, NULL_TREE);
31590 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31591 call_expr = build_call_expr (fndecl, 0);
31592 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31594 case IX86_BUILTIN_CPU_IS:
31595 case IX86_BUILTIN_CPU_SUPPORTS:
31597 tree arg0 = CALL_EXPR_ARG (exp, 0);
31598 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31599 gcc_assert (fold_expr != NULL_TREE);
31600 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31604 /* Determine whether the builtin function is available under the current ISA.
31605 Originally the builtin was not created if it wasn't applicable to the
31606 current ISA based on the command-line switches. With function-specific
31607 options, we need to check, in the context of the function making the call,
31608 whether it is supported. */
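/* For example (illustrative only): with plain -msse2 on the command line,
   a caller declared __attribute__ ((target ("avx2"))) may legitimately use
   AVX2 builtins, so the check below consults the ISA of the current
   function rather than only the global switches.  */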
31609 if (ix86_builtins_isa[fcode].isa
31610 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31612 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31613 NULL, (enum fpmath_unit) 0, false);
31615 if (!opts)
31616 error ("%qE needs unknown isa option", fndecl);
31617 else
31619 gcc_assert (opts != NULL);
31620 error ("%qE needs isa option %s", fndecl, opts);
31621 free (opts);
31623 return const0_rtx;
31626 switch (fcode)
31628 case IX86_BUILTIN_MASKMOVQ:
31629 case IX86_BUILTIN_MASKMOVDQU:
31630 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31631 ? CODE_FOR_mmx_maskmovq
31632 : CODE_FOR_sse2_maskmovdqu);
31633 /* Note the arg order is different from the operand order. */
31634 arg1 = CALL_EXPR_ARG (exp, 0);
31635 arg2 = CALL_EXPR_ARG (exp, 1);
31636 arg0 = CALL_EXPR_ARG (exp, 2);
31637 op0 = expand_normal (arg0);
31638 op1 = expand_normal (arg1);
31639 op2 = expand_normal (arg2);
31640 mode0 = insn_data[icode].operand[0].mode;
31641 mode1 = insn_data[icode].operand[1].mode;
31642 mode2 = insn_data[icode].operand[2].mode;
31644 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31645 op0 = gen_rtx_MEM (mode1, op0);
31647 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31648 op0 = copy_to_mode_reg (mode0, op0);
31649 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31650 op1 = copy_to_mode_reg (mode1, op1);
31651 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31652 op2 = copy_to_mode_reg (mode2, op2);
31653 pat = GEN_FCN (icode) (op0, op1, op2);
31654 if (! pat)
31655 return 0;
31656 emit_insn (pat);
31657 return 0;
31659 case IX86_BUILTIN_LDMXCSR:
31660 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31661 target = assign_386_stack_local (SImode, SLOT_TEMP);
31662 emit_move_insn (target, op0);
31663 emit_insn (gen_sse_ldmxcsr (target));
31664 return 0;
31666 case IX86_BUILTIN_STMXCSR:
31667 target = assign_386_stack_local (SImode, SLOT_TEMP);
31668 emit_insn (gen_sse_stmxcsr (target));
31669 return copy_to_mode_reg (SImode, target);
31671 case IX86_BUILTIN_CLFLUSH:
31672 arg0 = CALL_EXPR_ARG (exp, 0);
31673 op0 = expand_normal (arg0);
31674 icode = CODE_FOR_sse2_clflush;
31675 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31676 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31678 emit_insn (gen_sse2_clflush (op0));
31679 return 0;
31681 case IX86_BUILTIN_MONITOR:
31682 arg0 = CALL_EXPR_ARG (exp, 0);
31683 arg1 = CALL_EXPR_ARG (exp, 1);
31684 arg2 = CALL_EXPR_ARG (exp, 2);
31685 op0 = expand_normal (arg0);
31686 op1 = expand_normal (arg1);
31687 op2 = expand_normal (arg2);
31688 if (!REG_P (op0))
31689 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31690 if (!REG_P (op1))
31691 op1 = copy_to_mode_reg (SImode, op1);
31692 if (!REG_P (op2))
31693 op2 = copy_to_mode_reg (SImode, op2);
31694 emit_insn (ix86_gen_monitor (op0, op1, op2));
31695 return 0;
31697 case IX86_BUILTIN_MWAIT:
31698 arg0 = CALL_EXPR_ARG (exp, 0);
31699 arg1 = CALL_EXPR_ARG (exp, 1);
31700 op0 = expand_normal (arg0);
31701 op1 = expand_normal (arg1);
31702 if (!REG_P (op0))
31703 op0 = copy_to_mode_reg (SImode, op0);
31704 if (!REG_P (op1))
31705 op1 = copy_to_mode_reg (SImode, op1);
31706 emit_insn (gen_sse3_mwait (op0, op1));
31707 return 0;
31709 case IX86_BUILTIN_VEC_INIT_V2SI:
31710 case IX86_BUILTIN_VEC_INIT_V4HI:
31711 case IX86_BUILTIN_VEC_INIT_V8QI:
31712 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31714 case IX86_BUILTIN_VEC_EXT_V2DF:
31715 case IX86_BUILTIN_VEC_EXT_V2DI:
31716 case IX86_BUILTIN_VEC_EXT_V4SF:
31717 case IX86_BUILTIN_VEC_EXT_V4SI:
31718 case IX86_BUILTIN_VEC_EXT_V8HI:
31719 case IX86_BUILTIN_VEC_EXT_V2SI:
31720 case IX86_BUILTIN_VEC_EXT_V4HI:
31721 case IX86_BUILTIN_VEC_EXT_V16QI:
31722 return ix86_expand_vec_ext_builtin (exp, target);
31724 case IX86_BUILTIN_VEC_SET_V2DI:
31725 case IX86_BUILTIN_VEC_SET_V4SF:
31726 case IX86_BUILTIN_VEC_SET_V4SI:
31727 case IX86_BUILTIN_VEC_SET_V8HI:
31728 case IX86_BUILTIN_VEC_SET_V4HI:
31729 case IX86_BUILTIN_VEC_SET_V16QI:
31730 return ix86_expand_vec_set_builtin (exp);
31732 case IX86_BUILTIN_INFQ:
31733 case IX86_BUILTIN_HUGE_VALQ:
31735 REAL_VALUE_TYPE inf;
31736 rtx tmp;
31738 real_inf (&inf);
31739 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31741 tmp = validize_mem (force_const_mem (mode, tmp));
31743 if (target == 0)
31744 target = gen_reg_rtx (mode);
31746 emit_move_insn (target, tmp);
31747 return target;
31750 case IX86_BUILTIN_RDPMC:
31751 case IX86_BUILTIN_RDTSC:
31752 case IX86_BUILTIN_RDTSCP:
31754 op0 = gen_reg_rtx (DImode);
31755 op1 = gen_reg_rtx (DImode);
31757 if (fcode == IX86_BUILTIN_RDPMC)
31759 arg0 = CALL_EXPR_ARG (exp, 0);
31760 op2 = expand_normal (arg0);
31761 if (!register_operand (op2, SImode))
31762 op2 = copy_to_mode_reg (SImode, op2);
31764 insn = (TARGET_64BIT
31765 ? gen_rdpmc_rex64 (op0, op1, op2)
31766 : gen_rdpmc (op0, op2));
31767 emit_insn (insn);
31769 else if (fcode == IX86_BUILTIN_RDTSC)
31771 insn = (TARGET_64BIT
31772 ? gen_rdtsc_rex64 (op0, op1)
31773 : gen_rdtsc (op0));
31774 emit_insn (insn);
31776 else
31778 op2 = gen_reg_rtx (SImode);
31780 insn = (TARGET_64BIT
31781 ? gen_rdtscp_rex64 (op0, op1, op2)
31782 : gen_rdtscp (op0, op2));
31783 emit_insn (insn);
31785 arg0 = CALL_EXPR_ARG (exp, 0);
31786 op4 = expand_normal (arg0);
31787 if (!address_operand (op4, VOIDmode))
31789 op4 = convert_memory_address (Pmode, op4);
31790 op4 = copy_addr_to_reg (op4);
31792 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31795 if (target == 0)
31796 target = gen_reg_rtx (mode);
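/* On 64-bit targets the result comes back as two DImode registers holding
   the low and high 32 bits; they are recombined below as
   op0 |= op1 << 32.  */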
31798 if (TARGET_64BIT)
31800 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31801 op1, 1, OPTAB_DIRECT);
31802 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31803 op0, 1, OPTAB_DIRECT);
31806 emit_move_insn (target, op0);
31807 return target;
31809 case IX86_BUILTIN_FXSAVE:
31810 case IX86_BUILTIN_FXRSTOR:
31811 case IX86_BUILTIN_FXSAVE64:
31812 case IX86_BUILTIN_FXRSTOR64:
31813 switch (fcode)
31815 case IX86_BUILTIN_FXSAVE:
31816 icode = CODE_FOR_fxsave;
31817 break;
31818 case IX86_BUILTIN_FXRSTOR:
31819 icode = CODE_FOR_fxrstor;
31820 break;
31821 case IX86_BUILTIN_FXSAVE64:
31822 icode = CODE_FOR_fxsave64;
31823 break;
31824 case IX86_BUILTIN_FXRSTOR64:
31825 icode = CODE_FOR_fxrstor64;
31826 break;
31827 default:
31828 gcc_unreachable ();
31831 arg0 = CALL_EXPR_ARG (exp, 0);
31832 op0 = expand_normal (arg0);
31834 if (!address_operand (op0, VOIDmode))
31836 op0 = convert_memory_address (Pmode, op0);
31837 op0 = copy_addr_to_reg (op0);
31839 op0 = gen_rtx_MEM (BLKmode, op0);
31841 pat = GEN_FCN (icode) (op0);
31842 if (pat)
31843 emit_insn (pat);
31844 return 0;
31846 case IX86_BUILTIN_XSAVE:
31847 case IX86_BUILTIN_XRSTOR:
31848 case IX86_BUILTIN_XSAVE64:
31849 case IX86_BUILTIN_XRSTOR64:
31850 case IX86_BUILTIN_XSAVEOPT:
31851 case IX86_BUILTIN_XSAVEOPT64:
31852 arg0 = CALL_EXPR_ARG (exp, 0);
31853 arg1 = CALL_EXPR_ARG (exp, 1);
31854 op0 = expand_normal (arg0);
31855 op1 = expand_normal (arg1);
31857 if (!address_operand (op0, VOIDmode))
31859 op0 = convert_memory_address (Pmode, op0);
31860 op0 = copy_addr_to_reg (op0);
31862 op0 = gen_rtx_MEM (BLKmode, op0);
31864 op1 = force_reg (DImode, op1);
31866 if (TARGET_64BIT)
31868 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31869 NULL, 1, OPTAB_DIRECT);
31870 switch (fcode)
31872 case IX86_BUILTIN_XSAVE:
31873 icode = CODE_FOR_xsave_rex64;
31874 break;
31875 case IX86_BUILTIN_XRSTOR:
31876 icode = CODE_FOR_xrstor_rex64;
31877 break;
31878 case IX86_BUILTIN_XSAVE64:
31879 icode = CODE_FOR_xsave64;
31880 break;
31881 case IX86_BUILTIN_XRSTOR64:
31882 icode = CODE_FOR_xrstor64;
31883 break;
31884 case IX86_BUILTIN_XSAVEOPT:
31885 icode = CODE_FOR_xsaveopt_rex64;
31886 break;
31887 case IX86_BUILTIN_XSAVEOPT64:
31888 icode = CODE_FOR_xsaveopt64;
31889 break;
31890 default:
31891 gcc_unreachable ();
31894 op2 = gen_lowpart (SImode, op2);
31895 op1 = gen_lowpart (SImode, op1);
31896 pat = GEN_FCN (icode) (op0, op1, op2);
31898 else
31900 switch (fcode)
31902 case IX86_BUILTIN_XSAVE:
31903 icode = CODE_FOR_xsave;
31904 break;
31905 case IX86_BUILTIN_XRSTOR:
31906 icode = CODE_FOR_xrstor;
31907 break;
31908 case IX86_BUILTIN_XSAVEOPT:
31909 icode = CODE_FOR_xsaveopt;
31910 break;
31911 default:
31912 gcc_unreachable ();
31914 pat = GEN_FCN (icode) (op0, op1);
31917 if (pat)
31918 emit_insn (pat);
31919 return 0;
31921 case IX86_BUILTIN_LLWPCB:
31922 arg0 = CALL_EXPR_ARG (exp, 0);
31923 op0 = expand_normal (arg0);
31924 icode = CODE_FOR_lwp_llwpcb;
31925 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31926 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31927 emit_insn (gen_lwp_llwpcb (op0));
31928 return 0;
31930 case IX86_BUILTIN_SLWPCB:
31931 icode = CODE_FOR_lwp_slwpcb;
31932 if (!target
31933 || !insn_data[icode].operand[0].predicate (target, Pmode))
31934 target = gen_reg_rtx (Pmode);
31935 emit_insn (gen_lwp_slwpcb (target));
31936 return target;
31938 case IX86_BUILTIN_BEXTRI32:
31939 case IX86_BUILTIN_BEXTRI64:
31940 arg0 = CALL_EXPR_ARG (exp, 0);
31941 arg1 = CALL_EXPR_ARG (exp, 1);
31942 op0 = expand_normal (arg0);
31943 op1 = expand_normal (arg1);
31944 icode = (fcode == IX86_BUILTIN_BEXTRI32
31945 ? CODE_FOR_tbm_bextri_si
31946 : CODE_FOR_tbm_bextri_di);
31947 if (!CONST_INT_P (op1))
31949 error ("the last argument must be an immediate");
31950 return const0_rtx;
31952 else
31954 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31955 unsigned char lsb_index = INTVAL (op1) & 0xFF;
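/* Worked example: a control operand of ((8 << 8) | 4) == 0x0804 selects
   length == 8 and lsb_index == 4, i.e. extract 8 bits starting at bit 4.  */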
31956 op1 = GEN_INT (length);
31957 op2 = GEN_INT (lsb_index);
31958 pat = GEN_FCN (icode) (target, op0, op1, op2);
31959 if (pat)
31960 emit_insn (pat);
31961 return target;
31964 case IX86_BUILTIN_RDRAND16_STEP:
31965 icode = CODE_FOR_rdrandhi_1;
31966 mode0 = HImode;
31967 goto rdrand_step;
31969 case IX86_BUILTIN_RDRAND32_STEP:
31970 icode = CODE_FOR_rdrandsi_1;
31971 mode0 = SImode;
31972 goto rdrand_step;
31974 case IX86_BUILTIN_RDRAND64_STEP:
31975 icode = CODE_FOR_rdranddi_1;
31976 mode0 = DImode;
31978 rdrand_step:
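/* Typical use through immintrin.h (illustrative sketch; assumes the
   _rdrand32_step wrapper):

     unsigned int r;
     int ok = _rdrand32_step (&r);

   The code below stores the random value through the pointer argument and
   returns a CF-based success indication.  */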
31979 op0 = gen_reg_rtx (mode0);
31980 emit_insn (GEN_FCN (icode) (op0));
31982 arg0 = CALL_EXPR_ARG (exp, 0);
31983 op1 = expand_normal (arg0);
31984 if (!address_operand (op1, VOIDmode))
31986 op1 = convert_memory_address (Pmode, op1);
31987 op1 = copy_addr_to_reg (op1);
31989 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31991 op1 = gen_reg_rtx (SImode);
31992 emit_move_insn (op1, CONST1_RTX (SImode));
31994 /* Emit SImode conditional move. */
31995 if (mode0 == HImode)
31997 op2 = gen_reg_rtx (SImode);
31998 emit_insn (gen_zero_extendhisi2 (op2, op0));
32000 else if (mode0 == SImode)
32001 op2 = op0;
32002 else
32003 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32005 if (target == 0)
32006 target = gen_reg_rtx (SImode);
32008 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32009 const0_rtx);
32010 emit_insn (gen_rtx_SET (VOIDmode, target,
32011 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32012 return target;
32014 case IX86_BUILTIN_RDSEED16_STEP:
32015 icode = CODE_FOR_rdseedhi_1;
32016 mode0 = HImode;
32017 goto rdseed_step;
32019 case IX86_BUILTIN_RDSEED32_STEP:
32020 icode = CODE_FOR_rdseedsi_1;
32021 mode0 = SImode;
32022 goto rdseed_step;
32024 case IX86_BUILTIN_RDSEED64_STEP:
32025 icode = CODE_FOR_rdseeddi_1;
32026 mode0 = DImode;
32028 rdseed_step:
32029 op0 = gen_reg_rtx (mode0);
32030 emit_insn (GEN_FCN (icode) (op0));
32032 arg0 = CALL_EXPR_ARG (exp, 0);
32033 op1 = expand_normal (arg0);
32034 if (!address_operand (op1, VOIDmode))
32036 op1 = convert_memory_address (Pmode, op1);
32037 op1 = copy_addr_to_reg (op1);
32039 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32041 op2 = gen_reg_rtx (QImode);
32043 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32044 const0_rtx);
32045 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32047 if (target == 0)
32048 target = gen_reg_rtx (SImode);
32050 emit_insn (gen_zero_extendqisi2 (target, op2));
32051 return target;
32053 case IX86_BUILTIN_ADDCARRYX32:
32054 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32055 mode0 = SImode;
32056 goto addcarryx;
32058 case IX86_BUILTIN_ADDCARRYX64:
32059 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32060 mode0 = DImode;
32062 addcarryx:
32063 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32064 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32065 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32066 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32068 op0 = gen_reg_rtx (QImode);
32070 /* Generate CF from the input operand: adding 0xff to c_in sets the carry flag exactly when c_in is nonzero. */
32071 op1 = expand_normal (arg0);
32072 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32073 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32075 /* Generate an ADCX (or, without TARGET_ADX, a plain ADC) instruction to compute X + Y + CF. */
32076 op2 = expand_normal (arg1);
32077 op3 = expand_normal (arg2);
32079 if (!REG_P (op2))
32080 op2 = copy_to_mode_reg (mode0, op2);
32081 if (!REG_P (op3))
32082 op3 = copy_to_mode_reg (mode0, op3);
32084 op0 = gen_reg_rtx (mode0);
32086 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32087 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32088 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32090 /* Store the result. */
32091 op4 = expand_normal (arg3);
32092 if (!address_operand (op4, VOIDmode))
32094 op4 = convert_memory_address (Pmode, op4);
32095 op4 = copy_addr_to_reg (op4);
32097 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32099 /* Return current CF value. */
32100 if (target == 0)
32101 target = gen_reg_rtx (QImode);
32103 PUT_MODE (pat, QImode);
32104 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32105 return target;
32107 case IX86_BUILTIN_GATHERSIV2DF:
32108 icode = CODE_FOR_avx2_gathersiv2df;
32109 goto gather_gen;
32110 case IX86_BUILTIN_GATHERSIV4DF:
32111 icode = CODE_FOR_avx2_gathersiv4df;
32112 goto gather_gen;
32113 case IX86_BUILTIN_GATHERDIV2DF:
32114 icode = CODE_FOR_avx2_gatherdiv2df;
32115 goto gather_gen;
32116 case IX86_BUILTIN_GATHERDIV4DF:
32117 icode = CODE_FOR_avx2_gatherdiv4df;
32118 goto gather_gen;
32119 case IX86_BUILTIN_GATHERSIV4SF:
32120 icode = CODE_FOR_avx2_gathersiv4sf;
32121 goto gather_gen;
32122 case IX86_BUILTIN_GATHERSIV8SF:
32123 icode = CODE_FOR_avx2_gathersiv8sf;
32124 goto gather_gen;
32125 case IX86_BUILTIN_GATHERDIV4SF:
32126 icode = CODE_FOR_avx2_gatherdiv4sf;
32127 goto gather_gen;
32128 case IX86_BUILTIN_GATHERDIV8SF:
32129 icode = CODE_FOR_avx2_gatherdiv8sf;
32130 goto gather_gen;
32131 case IX86_BUILTIN_GATHERSIV2DI:
32132 icode = CODE_FOR_avx2_gathersiv2di;
32133 goto gather_gen;
32134 case IX86_BUILTIN_GATHERSIV4DI:
32135 icode = CODE_FOR_avx2_gathersiv4di;
32136 goto gather_gen;
32137 case IX86_BUILTIN_GATHERDIV2DI:
32138 icode = CODE_FOR_avx2_gatherdiv2di;
32139 goto gather_gen;
32140 case IX86_BUILTIN_GATHERDIV4DI:
32141 icode = CODE_FOR_avx2_gatherdiv4di;
32142 goto gather_gen;
32143 case IX86_BUILTIN_GATHERSIV4SI:
32144 icode = CODE_FOR_avx2_gathersiv4si;
32145 goto gather_gen;
32146 case IX86_BUILTIN_GATHERSIV8SI:
32147 icode = CODE_FOR_avx2_gathersiv8si;
32148 goto gather_gen;
32149 case IX86_BUILTIN_GATHERDIV4SI:
32150 icode = CODE_FOR_avx2_gatherdiv4si;
32151 goto gather_gen;
32152 case IX86_BUILTIN_GATHERDIV8SI:
32153 icode = CODE_FOR_avx2_gatherdiv8si;
32154 goto gather_gen;
32155 case IX86_BUILTIN_GATHERALTSIV4DF:
32156 icode = CODE_FOR_avx2_gathersiv4df;
32157 goto gather_gen;
32158 case IX86_BUILTIN_GATHERALTDIV8SF:
32159 icode = CODE_FOR_avx2_gatherdiv8sf;
32160 goto gather_gen;
32161 case IX86_BUILTIN_GATHERALTSIV4DI:
32162 icode = CODE_FOR_avx2_gathersiv4di;
32163 goto gather_gen;
32164 case IX86_BUILTIN_GATHERALTDIV8SI:
32165 icode = CODE_FOR_avx2_gatherdiv8si;
32166 goto gather_gen;
32168 gather_gen:
32169 arg0 = CALL_EXPR_ARG (exp, 0);
32170 arg1 = CALL_EXPR_ARG (exp, 1);
32171 arg2 = CALL_EXPR_ARG (exp, 2);
32172 arg3 = CALL_EXPR_ARG (exp, 3);
32173 arg4 = CALL_EXPR_ARG (exp, 4);
32174 op0 = expand_normal (arg0);
32175 op1 = expand_normal (arg1);
32176 op2 = expand_normal (arg2);
32177 op3 = expand_normal (arg3);
32178 op4 = expand_normal (arg4);
32179 /* Note the arg order is different from the operand order. */
32180 mode0 = insn_data[icode].operand[1].mode;
32181 mode2 = insn_data[icode].operand[3].mode;
32182 mode3 = insn_data[icode].operand[4].mode;
32183 mode4 = insn_data[icode].operand[5].mode;
32185 if (target == NULL_RTX
32186 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32187 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32188 else
32189 subtarget = target;
32191 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32192 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32194 rtx half = gen_reg_rtx (V4SImode);
32195 if (!nonimmediate_operand (op2, V8SImode))
32196 op2 = copy_to_mode_reg (V8SImode, op2);
32197 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32198 op2 = half;
32200 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32201 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32203 rtx (*gen) (rtx, rtx);
32204 rtx half = gen_reg_rtx (mode0);
32205 if (mode0 == V4SFmode)
32206 gen = gen_vec_extract_lo_v8sf;
32207 else
32208 gen = gen_vec_extract_lo_v8si;
32209 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32210 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32211 emit_insn (gen (half, op0));
32212 op0 = half;
32213 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32214 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32215 emit_insn (gen (half, op3));
32216 op3 = half;
32219 /* Force the memory operand to use only a base register here. But we
32220 don't want to do this to the memory operands of other builtin
32221 functions. */
32222 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32224 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32225 op0 = copy_to_mode_reg (mode0, op0);
32226 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32227 op1 = copy_to_mode_reg (Pmode, op1);
32228 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32229 op2 = copy_to_mode_reg (mode2, op2);
32230 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32231 op3 = copy_to_mode_reg (mode3, op3);
32232 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32234 error ("the last argument must be scale 1, 2, 4 or 8");
32235 return const0_rtx;
32238 /* Optimize. If every element of the mask is known to have its
32239 sign bit set, replace op0 with pc_rtx to signal that the
32240 instruction overwrites the whole destination and doesn't use its
32241 previous contents. */
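/* Illustrative case (user code, not compiler code; the intrinsic spelling
   and argument order are an assumption based on avx2intrin.h):

     __m256d g = _mm256_mask_i32gather_pd (src, base, idx,
                                           _mm256_set1_pd (-1.0), 8);

   Every mask element is negative, so every sign bit is set, the source
   operand SRC is irrelevant, and op0 can become pc_rtx.  */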
32242 if (optimize)
32244 if (TREE_CODE (arg3) == VECTOR_CST)
32246 unsigned int negative = 0;
32247 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32249 tree cst = VECTOR_CST_ELT (arg3, i);
32250 if (TREE_CODE (cst) == INTEGER_CST
32251 && tree_int_cst_sign_bit (cst))
32252 negative++;
32253 else if (TREE_CODE (cst) == REAL_CST
32254 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32255 negative++;
32257 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32258 op0 = pc_rtx;
32260 else if (TREE_CODE (arg3) == SSA_NAME)
32262 /* Recognize also when mask is like:
32263 __v2df src = _mm_setzero_pd ();
32264 __v2df mask = _mm_cmpeq_pd (src, src);
32266 __v8sf src = _mm256_setzero_ps ();
32267 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32268 as that is a cheaper way to load all ones into
32269 a register than having to load a constant from
32270 memory. */
32271 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32272 if (is_gimple_call (def_stmt))
32274 tree fndecl = gimple_call_fndecl (def_stmt);
32275 if (fndecl
32276 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32277 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32279 case IX86_BUILTIN_CMPPD:
32280 case IX86_BUILTIN_CMPPS:
32281 case IX86_BUILTIN_CMPPD256:
32282 case IX86_BUILTIN_CMPPS256:
32283 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32284 break;
32285 /* FALLTHRU */
32286 case IX86_BUILTIN_CMPEQPD:
32287 case IX86_BUILTIN_CMPEQPS:
32288 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32289 && initializer_zerop (gimple_call_arg (def_stmt,
32290 1)))
32291 op0 = pc_rtx;
32292 break;
32293 default:
32294 break;
32300 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32301 if (! pat)
32302 return const0_rtx;
32303 emit_insn (pat);
32305 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32306 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32308 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32309 ? V4SFmode : V4SImode;
32310 if (target == NULL_RTX)
32311 target = gen_reg_rtx (tmode);
32312 if (tmode == V4SFmode)
32313 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32314 else
32315 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32317 else
32318 target = subtarget;
32320 return target;
32322 case IX86_BUILTIN_XABORT:
32323 icode = CODE_FOR_xabort;
32324 arg0 = CALL_EXPR_ARG (exp, 0);
32325 op0 = expand_normal (arg0);
32326 mode0 = insn_data[icode].operand[0].mode;
32327 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32329 error ("the argument to xabort must be an 8-bit immediate");
32330 return const0_rtx;
32332 emit_insn (gen_xabort (op0));
32333 return 0;
32335 default:
32336 break;
32339 for (i = 0, d = bdesc_special_args;
32340 i < ARRAY_SIZE (bdesc_special_args);
32341 i++, d++)
32342 if (d->code == fcode)
32343 return ix86_expand_special_args_builtin (d, exp, target);
32345 for (i = 0, d = bdesc_args;
32346 i < ARRAY_SIZE (bdesc_args);
32347 i++, d++)
32348 if (d->code == fcode)
32349 switch (fcode)
32351 case IX86_BUILTIN_FABSQ:
32352 case IX86_BUILTIN_COPYSIGNQ:
32353 if (!TARGET_SSE)
32354 /* Emit a normal call if SSE isn't available. */
32355 return expand_call (exp, target, ignore);
32356 default:
32357 return ix86_expand_args_builtin (d, exp, target);
32360 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32361 if (d->code == fcode)
32362 return ix86_expand_sse_comi (d, exp, target);
32364 for (i = 0, d = bdesc_pcmpestr;
32365 i < ARRAY_SIZE (bdesc_pcmpestr);
32366 i++, d++)
32367 if (d->code == fcode)
32368 return ix86_expand_sse_pcmpestr (d, exp, target);
32370 for (i = 0, d = bdesc_pcmpistr;
32371 i < ARRAY_SIZE (bdesc_pcmpistr);
32372 i++, d++)
32373 if (d->code == fcode)
32374 return ix86_expand_sse_pcmpistr (d, exp, target);
32376 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32377 if (d->code == fcode)
32378 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32379 (enum ix86_builtin_func_type)
32380 d->flag, d->comparison);
32382 gcc_unreachable ();
32385 /* Returns a function decl for a vectorized version of the builtin function
32386 with builtin function code FN and the result vector type TYPE, or NULL_TREE
32387 if it is not available. */
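/* For example (a sketch): when the vectorizer asks for BUILT_IN_SQRT with
   V2DF as both type_in and type_out, the switch below returns the decl of
   IX86_BUILTIN_SQRTPD, so the loop can use a single sqrtpd instruction.  */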
32389 static tree
32390 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32391 tree type_in)
32393 enum machine_mode in_mode, out_mode;
32394 int in_n, out_n;
32395 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32397 if (TREE_CODE (type_out) != VECTOR_TYPE
32398 || TREE_CODE (type_in) != VECTOR_TYPE
32399 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32400 return NULL_TREE;
32402 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32403 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32404 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32405 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32407 switch (fn)
32409 case BUILT_IN_SQRT:
32410 if (out_mode == DFmode && in_mode == DFmode)
32412 if (out_n == 2 && in_n == 2)
32413 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32414 else if (out_n == 4 && in_n == 4)
32415 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32417 break;
32419 case BUILT_IN_SQRTF:
32420 if (out_mode == SFmode && in_mode == SFmode)
32422 if (out_n == 4 && in_n == 4)
32423 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32424 else if (out_n == 8 && in_n == 8)
32425 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32427 break;
32429 case BUILT_IN_IFLOOR:
32430 case BUILT_IN_LFLOOR:
32431 case BUILT_IN_LLFLOOR:
32432 /* The round insn does not trap on denormals. */
32433 if (flag_trapping_math || !TARGET_ROUND)
32434 break;
32436 if (out_mode == SImode && in_mode == DFmode)
32438 if (out_n == 4 && in_n == 2)
32439 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32440 else if (out_n == 8 && in_n == 4)
32441 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32443 break;
32445 case BUILT_IN_IFLOORF:
32446 case BUILT_IN_LFLOORF:
32447 case BUILT_IN_LLFLOORF:
32448 /* The round insn does not trap on denormals. */
32449 if (flag_trapping_math || !TARGET_ROUND)
32450 break;
32452 if (out_mode == SImode && in_mode == SFmode)
32454 if (out_n == 4 && in_n == 4)
32455 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32456 else if (out_n == 8 && in_n == 8)
32457 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32459 break;
32461 case BUILT_IN_ICEIL:
32462 case BUILT_IN_LCEIL:
32463 case BUILT_IN_LLCEIL:
32464 /* The round insn does not trap on denormals. */
32465 if (flag_trapping_math || !TARGET_ROUND)
32466 break;
32468 if (out_mode == SImode && in_mode == DFmode)
32470 if (out_n == 4 && in_n == 2)
32471 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32472 else if (out_n == 8 && in_n == 4)
32473 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32475 break;
32477 case BUILT_IN_ICEILF:
32478 case BUILT_IN_LCEILF:
32479 case BUILT_IN_LLCEILF:
32480 /* The round insn does not trap on denormals. */
32481 if (flag_trapping_math || !TARGET_ROUND)
32482 break;
32484 if (out_mode == SImode && in_mode == SFmode)
32486 if (out_n == 4 && in_n == 4)
32487 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32488 else if (out_n == 8 && in_n == 8)
32489 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32491 break;
32493 case BUILT_IN_IRINT:
32494 case BUILT_IN_LRINT:
32495 case BUILT_IN_LLRINT:
32496 if (out_mode == SImode && in_mode == DFmode)
32498 if (out_n == 4 && in_n == 2)
32499 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32500 else if (out_n == 8 && in_n == 4)
32501 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32503 break;
32505 case BUILT_IN_IRINTF:
32506 case BUILT_IN_LRINTF:
32507 case BUILT_IN_LLRINTF:
32508 if (out_mode == SImode && in_mode == SFmode)
32510 if (out_n == 4 && in_n == 4)
32511 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32512 else if (out_n == 8 && in_n == 8)
32513 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32515 break;
32517 case BUILT_IN_IROUND:
32518 case BUILT_IN_LROUND:
32519 case BUILT_IN_LLROUND:
32520 /* The round insn does not trap on denormals. */
32521 if (flag_trapping_math || !TARGET_ROUND)
32522 break;
32524 if (out_mode == SImode && in_mode == DFmode)
32526 if (out_n == 4 && in_n == 2)
32527 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32528 else if (out_n == 8 && in_n == 4)
32529 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32531 break;
32533 case BUILT_IN_IROUNDF:
32534 case BUILT_IN_LROUNDF:
32535 case BUILT_IN_LLROUNDF:
32536 /* The round insn does not trap on denormals. */
32537 if (flag_trapping_math || !TARGET_ROUND)
32538 break;
32540 if (out_mode == SImode && in_mode == SFmode)
32542 if (out_n == 4 && in_n == 4)
32543 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32544 else if (out_n == 8 && in_n == 8)
32545 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32547 break;
32549 case BUILT_IN_COPYSIGN:
32550 if (out_mode == DFmode && in_mode == DFmode)
32552 if (out_n == 2 && in_n == 2)
32553 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32554 else if (out_n == 4 && in_n == 4)
32555 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32557 break;
32559 case BUILT_IN_COPYSIGNF:
32560 if (out_mode == SFmode && in_mode == SFmode)
32562 if (out_n == 4 && in_n == 4)
32563 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32564 else if (out_n == 8 && in_n == 8)
32565 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32567 break;
32569 case BUILT_IN_FLOOR:
32570 /* The round insn does not trap on denormals. */
32571 if (flag_trapping_math || !TARGET_ROUND)
32572 break;
32574 if (out_mode == DFmode && in_mode == DFmode)
32576 if (out_n == 2 && in_n == 2)
32577 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32578 else if (out_n == 4 && in_n == 4)
32579 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32581 break;
32583 case BUILT_IN_FLOORF:
32584 /* The round insn does not trap on denormals. */
32585 if (flag_trapping_math || !TARGET_ROUND)
32586 break;
32588 if (out_mode == SFmode && in_mode == SFmode)
32590 if (out_n == 4 && in_n == 4)
32591 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32592 else if (out_n == 8 && in_n == 8)
32593 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32595 break;
32597 case BUILT_IN_CEIL:
32598 /* The round insn does not trap on denormals. */
32599 if (flag_trapping_math || !TARGET_ROUND)
32600 break;
32602 if (out_mode == DFmode && in_mode == DFmode)
32604 if (out_n == 2 && in_n == 2)
32605 return ix86_builtins[IX86_BUILTIN_CEILPD];
32606 else if (out_n == 4 && in_n == 4)
32607 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32609 break;
32611 case BUILT_IN_CEILF:
32612 /* The round insn does not trap on denormals. */
32613 if (flag_trapping_math || !TARGET_ROUND)
32614 break;
32616 if (out_mode == SFmode && in_mode == SFmode)
32618 if (out_n == 4 && in_n == 4)
32619 return ix86_builtins[IX86_BUILTIN_CEILPS];
32620 else if (out_n == 8 && in_n == 8)
32621 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32623 break;
32625 case BUILT_IN_TRUNC:
32626 /* The round insn does not trap on denormals. */
32627 if (flag_trapping_math || !TARGET_ROUND)
32628 break;
32630 if (out_mode == DFmode && in_mode == DFmode)
32632 if (out_n == 2 && in_n == 2)
32633 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32634 else if (out_n == 4 && in_n == 4)
32635 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32637 break;
32639 case BUILT_IN_TRUNCF:
32640 /* The round insn does not trap on denormals. */
32641 if (flag_trapping_math || !TARGET_ROUND)
32642 break;
32644 if (out_mode == SFmode && in_mode == SFmode)
32646 if (out_n == 4 && in_n == 4)
32647 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32648 else if (out_n == 8 && in_n == 8)
32649 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32651 break;
32653 case BUILT_IN_RINT:
32654 /* The round insn does not trap on denormals. */
32655 if (flag_trapping_math || !TARGET_ROUND)
32656 break;
32658 if (out_mode == DFmode && in_mode == DFmode)
32660 if (out_n == 2 && in_n == 2)
32661 return ix86_builtins[IX86_BUILTIN_RINTPD];
32662 else if (out_n == 4 && in_n == 4)
32663 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32665 break;
32667 case BUILT_IN_RINTF:
32668 /* The round insn does not trap on denormals. */
32669 if (flag_trapping_math || !TARGET_ROUND)
32670 break;
32672 if (out_mode == SFmode && in_mode == SFmode)
32674 if (out_n == 4 && in_n == 4)
32675 return ix86_builtins[IX86_BUILTIN_RINTPS];
32676 else if (out_n == 8 && in_n == 8)
32677 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32679 break;
32681 case BUILT_IN_ROUND:
32682 /* The round insn does not trap on denormals. */
32683 if (flag_trapping_math || !TARGET_ROUND)
32684 break;
32686 if (out_mode == DFmode && in_mode == DFmode)
32688 if (out_n == 2 && in_n == 2)
32689 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32690 else if (out_n == 4 && in_n == 4)
32691 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32693 break;
32695 case BUILT_IN_ROUNDF:
32696 /* The round insn does not trap on denormals. */
32697 if (flag_trapping_math || !TARGET_ROUND)
32698 break;
32700 if (out_mode == SFmode && in_mode == SFmode)
32702 if (out_n == 4 && in_n == 4)
32703 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32704 else if (out_n == 8 && in_n == 8)
32705 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32707 break;
32709 case BUILT_IN_FMA:
32710 if (out_mode == DFmode && in_mode == DFmode)
32712 if (out_n == 2 && in_n == 2)
32713 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32714 if (out_n == 4 && in_n == 4)
32715 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32717 break;
32719 case BUILT_IN_FMAF:
32720 if (out_mode == SFmode && in_mode == SFmode)
32722 if (out_n == 4 && in_n == 4)
32723 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32724 if (out_n == 8 && in_n == 8)
32725 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32727 break;
32729 default:
32730 break;
32733 /* Dispatch to a handler for a vectorization library. */
32734 if (ix86_veclib_handler)
32735 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32736 type_in);
32738 return NULL_TREE;
32741 /* Handler for an SVML-style interface to
32742 a library with vectorized intrinsics. */
32744 static tree
32745 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32747 char name[20];
32748 tree fntype, new_fndecl, args;
32749 unsigned arity;
32750 const char *bname;
32751 enum machine_mode el_mode, in_mode;
32752 int n, in_n;
32754 /* The SVML is suitable for unsafe math only. */
32755 if (!flag_unsafe_math_optimizations)
32756 return NULL_TREE;
32758 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32759 n = TYPE_VECTOR_SUBPARTS (type_out);
32760 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32761 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32762 if (el_mode != in_mode
32763 || n != in_n)
32764 return NULL_TREE;
32766 switch (fn)
32768 case BUILT_IN_EXP:
32769 case BUILT_IN_LOG:
32770 case BUILT_IN_LOG10:
32771 case BUILT_IN_POW:
32772 case BUILT_IN_TANH:
32773 case BUILT_IN_TAN:
32774 case BUILT_IN_ATAN:
32775 case BUILT_IN_ATAN2:
32776 case BUILT_IN_ATANH:
32777 case BUILT_IN_CBRT:
32778 case BUILT_IN_SINH:
32779 case BUILT_IN_SIN:
32780 case BUILT_IN_ASINH:
32781 case BUILT_IN_ASIN:
32782 case BUILT_IN_COSH:
32783 case BUILT_IN_COS:
32784 case BUILT_IN_ACOSH:
32785 case BUILT_IN_ACOS:
32786 if (el_mode != DFmode || n != 2)
32787 return NULL_TREE;
32788 break;
32790 case BUILT_IN_EXPF:
32791 case BUILT_IN_LOGF:
32792 case BUILT_IN_LOG10F:
32793 case BUILT_IN_POWF:
32794 case BUILT_IN_TANHF:
32795 case BUILT_IN_TANF:
32796 case BUILT_IN_ATANF:
32797 case BUILT_IN_ATAN2F:
32798 case BUILT_IN_ATANHF:
32799 case BUILT_IN_CBRTF:
32800 case BUILT_IN_SINHF:
32801 case BUILT_IN_SINF:
32802 case BUILT_IN_ASINHF:
32803 case BUILT_IN_ASINF:
32804 case BUILT_IN_COSHF:
32805 case BUILT_IN_COSF:
32806 case BUILT_IN_ACOSHF:
32807 case BUILT_IN_ACOSF:
32808 if (el_mode != SFmode || n != 4)
32809 return NULL_TREE;
32810 break;
32812 default:
32813 return NULL_TREE;
32816 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32818 if (fn == BUILT_IN_LOGF)
32819 strcpy (name, "vmlsLn4");
32820 else if (fn == BUILT_IN_LOG)
32821 strcpy (name, "vmldLn2");
32822 else if (n == 4)
32824 sprintf (name, "vmls%s", bname+10);
32825 name[strlen (name)-1] = '4';
32827 else
32828 sprintf (name, "vmld%s2", bname+10);
32830 /* Convert to uppercase. */
32831 name[4] &= ~0x20;
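 /* Illustrative example (not from the original source): for BUILT_IN_SINF
    the implicit decl name is "__builtin_sinf", so bname+10 is "sinf";
    "vmls%s" gives "vmlssinf", the trailing 'f' is overwritten with '4'
    and name[4] is upcased, yielding the SVML entry point "vmlsSin4".
    The double-precision path produces "vmldSin2" the same way.  */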
32833 arity = 0;
32834 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32835 args;
32836 args = TREE_CHAIN (args))
32837 arity++;
32839 if (arity == 1)
32840 fntype = build_function_type_list (type_out, type_in, NULL);
32841 else
32842 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32844 /* Build a function declaration for the vectorized function. */
32845 new_fndecl = build_decl (BUILTINS_LOCATION,
32846 FUNCTION_DECL, get_identifier (name), fntype);
32847 TREE_PUBLIC (new_fndecl) = 1;
32848 DECL_EXTERNAL (new_fndecl) = 1;
32849 DECL_IS_NOVOPS (new_fndecl) = 1;
32850 TREE_READONLY (new_fndecl) = 1;
32852 return new_fndecl;
32855 /* Handler for an ACML-style interface to
32856 a library with vectorized intrinsics. */
32858 static tree
32859 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32861 char name[20] = "__vr.._";
32862 tree fntype, new_fndecl, args;
32863 unsigned arity;
32864 const char *bname;
32865 enum machine_mode el_mode, in_mode;
32866 int n, in_n;
 32868 /* The ACML is 64-bit only and suitable only for unsafe math, as
 32869 it does not correctly support parts of IEEE semantics, such as
 32870 denormals, with the required precision. */
32871 if (!TARGET_64BIT
32872 || !flag_unsafe_math_optimizations)
32873 return NULL_TREE;
32875 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32876 n = TYPE_VECTOR_SUBPARTS (type_out);
32877 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32878 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32879 if (el_mode != in_mode
32880 || n != in_n)
32881 return NULL_TREE;
32883 switch (fn)
32885 case BUILT_IN_SIN:
32886 case BUILT_IN_COS:
32887 case BUILT_IN_EXP:
32888 case BUILT_IN_LOG:
32889 case BUILT_IN_LOG2:
32890 case BUILT_IN_LOG10:
32891 name[4] = 'd';
32892 name[5] = '2';
32893 if (el_mode != DFmode
32894 || n != 2)
32895 return NULL_TREE;
32896 break;
32898 case BUILT_IN_SINF:
32899 case BUILT_IN_COSF:
32900 case BUILT_IN_EXPF:
32901 case BUILT_IN_POWF:
32902 case BUILT_IN_LOGF:
32903 case BUILT_IN_LOG2F:
32904 case BUILT_IN_LOG10F:
32905 name[4] = 's';
32906 name[5] = '4';
32907 if (el_mode != SFmode
32908 || n != 4)
32909 return NULL_TREE;
32910 break;
32912 default:
32913 return NULL_TREE;
32916 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32917 sprintf (name + 7, "%s", bname+10);
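 /* Worked example (illustrative): for BUILT_IN_SINF the template "__vr.._"
    becomes "__vrs4_" after the switch above, and appending bname+10
    ("sinf") yields the ACML entry point "__vrs4_sinf"; the DFmode
    variant for BUILT_IN_SIN becomes "__vrd2_sin" the same way.  */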
32919 arity = 0;
32920 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32921 args;
32922 args = TREE_CHAIN (args))
32923 arity++;
32925 if (arity == 1)
32926 fntype = build_function_type_list (type_out, type_in, NULL);
32927 else
32928 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32930 /* Build a function declaration for the vectorized function. */
32931 new_fndecl = build_decl (BUILTINS_LOCATION,
32932 FUNCTION_DECL, get_identifier (name), fntype);
32933 TREE_PUBLIC (new_fndecl) = 1;
32934 DECL_EXTERNAL (new_fndecl) = 1;
32935 DECL_IS_NOVOPS (new_fndecl) = 1;
32936 TREE_READONLY (new_fndecl) = 1;
32938 return new_fndecl;
 32941 /* Returns the decl of a function that implements a gather load with
 32942 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
 32943 Return NULL_TREE if it is not available. */
32945 static tree
32946 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32947 const_tree index_type, int scale)
32949 bool si;
32950 enum ix86_builtins code;
32952 if (! TARGET_AVX2)
32953 return NULL_TREE;
32955 if ((TREE_CODE (index_type) != INTEGER_TYPE
32956 && !POINTER_TYPE_P (index_type))
32957 || (TYPE_MODE (index_type) != SImode
32958 && TYPE_MODE (index_type) != DImode))
32959 return NULL_TREE;
32961 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32962 return NULL_TREE;
 32964 /* The v*gather* insns sign-extend the index to pointer mode. */
32965 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32966 && TYPE_UNSIGNED (index_type))
32967 return NULL_TREE;
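 /* The AVX2 v*gather* instructions only encode scales of 1, 2, 4 or 8,
    hence the power-of-two check below.  */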
32969 if (scale <= 0
32970 || scale > 8
32971 || (scale & (scale - 1)) != 0)
32972 return NULL_TREE;
32974 si = TYPE_MODE (index_type) == SImode;
32975 switch (TYPE_MODE (mem_vectype))
32977 case V2DFmode:
32978 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32979 break;
32980 case V4DFmode:
32981 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32982 break;
32983 case V2DImode:
32984 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32985 break;
32986 case V4DImode:
32987 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32988 break;
32989 case V4SFmode:
32990 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32991 break;
32992 case V8SFmode:
32993 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32994 break;
32995 case V4SImode:
32996 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32997 break;
32998 case V8SImode:
32999 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33000 break;
33001 default:
33002 return NULL_TREE;
33005 return ix86_builtins[code];
 33008 /* Returns the decl of a target-specific builtin that implements the
 33009 reciprocal of function FN, or NULL_TREE if it is not available. */
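 /* Roughly speaking: under the unsafe-math conditions checked below, a
    division by sqrt can be replaced by a multiplication with the hardware
    reciprocal-sqrt approximation (rsqrtss/rsqrtps) refined by a
    Newton-Raphson step; the *_NR builtins returned here provide that
    refined form.  */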
33011 static tree
33012 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33013 bool sqrt ATTRIBUTE_UNUSED)
33015 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33016 && flag_finite_math_only && !flag_trapping_math
33017 && flag_unsafe_math_optimizations))
33018 return NULL_TREE;
33020 if (md_fn)
33021 /* Machine dependent builtins. */
33022 switch (fn)
33024 /* Vectorized version of sqrt to rsqrt conversion. */
33025 case IX86_BUILTIN_SQRTPS_NR:
33026 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33028 case IX86_BUILTIN_SQRTPS_NR256:
33029 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33031 default:
33032 return NULL_TREE;
33034 else
33035 /* Normal builtins. */
33036 switch (fn)
33038 /* Sqrt to rsqrt conversion. */
33039 case BUILT_IN_SQRTF:
33040 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33042 default:
33043 return NULL_TREE;
33047 /* Helper for avx_vpermilps256_operand et al. This is also used by
33048 the expansion functions to turn the parallel back into a mask.
33049 The return value is 0 for no match and the imm8+1 for a match. */
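 /* Worked example (illustrative): in V4DFmode the identity selection
    (0 1 2 3) encodes as imm8 0b1010, so this function returns
    0b1010 + 1 = 0xb.  */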
33052 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33054 unsigned i, nelt = GET_MODE_NUNITS (mode);
33055 unsigned mask = 0;
33056 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33058 if (XVECLEN (par, 0) != (int) nelt)
33059 return 0;
33061 /* Validate that all of the elements are constants, and not totally
33062 out of range. Copy the data into an integral array to make the
33063 subsequent checks easier. */
33064 for (i = 0; i < nelt; ++i)
33066 rtx er = XVECEXP (par, 0, i);
33067 unsigned HOST_WIDE_INT ei;
33069 if (!CONST_INT_P (er))
33070 return 0;
33071 ei = INTVAL (er);
33072 if (ei >= nelt)
33073 return 0;
33074 ipar[i] = ei;
33077 switch (mode)
33079 case V4DFmode:
33080 /* In the 256-bit DFmode case, we can only move elements within
33081 a 128-bit lane. */
33082 for (i = 0; i < 2; ++i)
33084 if (ipar[i] >= 2)
33085 return 0;
33086 mask |= ipar[i] << i;
33088 for (i = 2; i < 4; ++i)
33090 if (ipar[i] < 2)
33091 return 0;
33092 mask |= (ipar[i] - 2) << i;
33094 break;
33096 case V8SFmode:
33097 /* In the 256-bit SFmode case, we have full freedom of movement
33098 within the low 128-bit lane, but the high 128-bit lane must
33099 mirror the exact same pattern. */
33100 for (i = 0; i < 4; ++i)
33101 if (ipar[i] + 4 != ipar[i + 4])
33102 return 0;
33103 nelt = 4;
33104 /* FALLTHRU */
33106 case V2DFmode:
33107 case V4SFmode:
33108 /* In the 128-bit case, we've full freedom in the placement of
33109 the elements from the source operand. */
33110 for (i = 0; i < nelt; ++i)
33111 mask |= ipar[i] << (i * (nelt / 2));
33112 break;
33114 default:
33115 gcc_unreachable ();
33118 /* Make sure success has a non-zero value by adding one. */
33119 return mask + 1;
33122 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33123 the expansion functions to turn the parallel back into a mask.
33124 The return value is 0 for no match and the imm8+1 for a match. */
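 /* Worked example (illustrative): for V8SFmode the selection
    (0 1 2 3 4 5 6 7), i.e. both halves of the first source in order,
    reduces to half indices 0 and 1, giving mask 0x10 and a return
    value of 0x11.  */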
33127 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33129 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33130 unsigned mask = 0;
33131 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33133 if (XVECLEN (par, 0) != (int) nelt)
33134 return 0;
33136 /* Validate that all of the elements are constants, and not totally
33137 out of range. Copy the data into an integral array to make the
33138 subsequent checks easier. */
33139 for (i = 0; i < nelt; ++i)
33141 rtx er = XVECEXP (par, 0, i);
33142 unsigned HOST_WIDE_INT ei;
33144 if (!CONST_INT_P (er))
33145 return 0;
33146 ei = INTVAL (er);
33147 if (ei >= 2 * nelt)
33148 return 0;
33149 ipar[i] = ei;
 33152 /* Validate that each half of the permute selects a run of consecutive elements. */
33153 for (i = 0; i < nelt2 - 1; ++i)
33154 if (ipar[i] + 1 != ipar[i + 1])
33155 return 0;
33156 for (i = nelt2; i < nelt - 1; ++i)
33157 if (ipar[i] + 1 != ipar[i + 1])
33158 return 0;
33160 /* Reconstruct the mask. */
33161 for (i = 0; i < 2; ++i)
33163 unsigned e = ipar[i * nelt2];
33164 if (e % nelt2)
33165 return 0;
33166 e /= nelt2;
33167 mask |= e << (i * 4);
33170 /* Make sure success has a non-zero value by adding one. */
33171 return mask + 1;
 33174 /* Store OPERAND to memory after reload is completed. This means
 33175 that we can't easily use assign_stack_local. */
33177 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33179 rtx result;
33181 gcc_assert (reload_completed);
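 /* If the red zone below the stack pointer is usable, store the operand
    there so no stack-pointer adjustment is needed; otherwise push it with
    a pre-decrement store and address it at the new stack pointer.  */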
33182 if (ix86_using_red_zone ())
33184 result = gen_rtx_MEM (mode,
33185 gen_rtx_PLUS (Pmode,
33186 stack_pointer_rtx,
33187 GEN_INT (-RED_ZONE_SIZE)));
33188 emit_move_insn (result, operand);
33190 else if (TARGET_64BIT)
33192 switch (mode)
33194 case HImode:
33195 case SImode:
33196 operand = gen_lowpart (DImode, operand);
33197 /* FALLTHRU */
33198 case DImode:
33199 emit_insn (
33200 gen_rtx_SET (VOIDmode,
33201 gen_rtx_MEM (DImode,
33202 gen_rtx_PRE_DEC (DImode,
33203 stack_pointer_rtx)),
33204 operand));
33205 break;
33206 default:
33207 gcc_unreachable ();
33209 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33211 else
33213 switch (mode)
33215 case DImode:
33217 rtx operands[2];
33218 split_double_mode (mode, &operand, 1, operands, operands + 1);
33219 emit_insn (
33220 gen_rtx_SET (VOIDmode,
33221 gen_rtx_MEM (SImode,
33222 gen_rtx_PRE_DEC (Pmode,
33223 stack_pointer_rtx)),
33224 operands[1]));
33225 emit_insn (
33226 gen_rtx_SET (VOIDmode,
33227 gen_rtx_MEM (SImode,
33228 gen_rtx_PRE_DEC (Pmode,
33229 stack_pointer_rtx)),
33230 operands[0]));
33232 break;
33233 case HImode:
33234 /* Store HImodes as SImodes. */
33235 operand = gen_lowpart (SImode, operand);
33236 /* FALLTHRU */
33237 case SImode:
33238 emit_insn (
33239 gen_rtx_SET (VOIDmode,
33240 gen_rtx_MEM (GET_MODE (operand),
33241 gen_rtx_PRE_DEC (SImode,
33242 stack_pointer_rtx)),
33243 operand));
33244 break;
33245 default:
33246 gcc_unreachable ();
33248 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33250 return result;
 33253 /* Free the operand from memory. */
33254 void
33255 ix86_free_from_memory (enum machine_mode mode)
33257 if (!ix86_using_red_zone ())
33259 int size;
33261 if (mode == DImode || TARGET_64BIT)
33262 size = 8;
33263 else
33264 size = 4;
 33265 /* Use LEA to deallocate stack space. In peephole2 it will be converted
 33266 to a pop or add instruction if registers are available. */
33267 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33268 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33269 GEN_INT (size))));
33273 /* Return a register priority for hard reg REGNO. */
33274 static int
33275 ix86_register_priority (int hard_regno)
 33277 /* ebp and r13 as the base always want a displacement, and r12 as the
 33278 base always wants an index. So discourage their use in an
 33279 address. */
33280 if (hard_regno == R12_REG || hard_regno == R13_REG)
33281 return 0;
33282 if (hard_regno == BP_REG)
33283 return 1;
33284 /* New x86-64 int registers result in bigger code size. Discourage
33285 them. */
33286 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33287 return 2;
33288 /* New x86-64 SSE registers result in bigger code size. Discourage
33289 them. */
33290 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33291 return 2;
33292 /* Usage of AX register results in smaller code. Prefer it. */
33293 if (hard_regno == 0)
33294 return 4;
33295 return 3;
33298 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33300 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33301 QImode must go into class Q_REGS.
33302 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33303 movdf to do mem-to-mem moves through integer regs. */
33305 static reg_class_t
33306 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33308 enum machine_mode mode = GET_MODE (x);
33310 /* We're only allowed to return a subclass of CLASS. Many of the
33311 following checks fail for NO_REGS, so eliminate that early. */
33312 if (regclass == NO_REGS)
33313 return NO_REGS;
33315 /* All classes can load zeros. */
33316 if (x == CONST0_RTX (mode))
33317 return regclass;
33319 /* Force constants into memory if we are loading a (nonzero) constant into
33320 an MMX or SSE register. This is because there are no MMX/SSE instructions
33321 to load from a constant. */
33322 if (CONSTANT_P (x)
33323 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33324 return NO_REGS;
33326 /* Prefer SSE regs only, if we can use them for math. */
33327 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33328 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33330 /* Floating-point constants need more complex checks. */
33331 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33333 /* General regs can load everything. */
33334 if (reg_class_subset_p (regclass, GENERAL_REGS))
33335 return regclass;
33337 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33338 zero above. We only want to wind up preferring 80387 registers if
33339 we plan on doing computation with them. */
33340 if (TARGET_80387
33341 && standard_80387_constant_p (x) > 0)
33343 /* Limit class to non-sse. */
33344 if (regclass == FLOAT_SSE_REGS)
33345 return FLOAT_REGS;
33346 if (regclass == FP_TOP_SSE_REGS)
33347 return FP_TOP_REG;
33348 if (regclass == FP_SECOND_SSE_REGS)
33349 return FP_SECOND_REG;
33350 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33351 return regclass;
33354 return NO_REGS;
 33357 /* Generally when we see PLUS here, it's the function invariant
 33358 (plus soft-fp const_int), which can only be computed into general
 33359 regs. */
33360 if (GET_CODE (x) == PLUS)
33361 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33363 /* QImode constants are easy to load, but non-constant QImode data
33364 must go into Q_REGS. */
33365 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33367 if (reg_class_subset_p (regclass, Q_REGS))
33368 return regclass;
33369 if (reg_class_subset_p (Q_REGS, regclass))
33370 return Q_REGS;
33371 return NO_REGS;
33374 return regclass;
33377 /* Discourage putting floating-point values in SSE registers unless
33378 SSE math is being used, and likewise for the 387 registers. */
33379 static reg_class_t
33380 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33382 enum machine_mode mode = GET_MODE (x);
33384 /* Restrict the output reload class to the register bank that we are doing
33385 math on. If we would like not to return a subset of CLASS, reject this
33386 alternative: if reload cannot do this, it will still use its choice. */
33387 mode = GET_MODE (x);
33388 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33389 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33391 if (X87_FLOAT_MODE_P (mode))
33393 if (regclass == FP_TOP_SSE_REGS)
33394 return FP_TOP_REG;
33395 else if (regclass == FP_SECOND_SSE_REGS)
33396 return FP_SECOND_REG;
33397 else
33398 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33401 return regclass;
33404 static reg_class_t
33405 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33406 enum machine_mode mode, secondary_reload_info *sri)
33408 /* Double-word spills from general registers to non-offsettable memory
33409 references (zero-extended addresses) require special handling. */
33410 if (TARGET_64BIT
33411 && MEM_P (x)
33412 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33413 && rclass == GENERAL_REGS
33414 && !offsettable_memref_p (x))
33416 sri->icode = (in_p
33417 ? CODE_FOR_reload_noff_load
33418 : CODE_FOR_reload_noff_store);
33419 /* Add the cost of moving address to a temporary. */
33420 sri->extra_cost = 1;
33422 return NO_REGS;
 33425 /* QImode spills from non-QI registers require an
 33426 intermediate register on 32-bit targets. */
33427 if (!TARGET_64BIT
33428 && !in_p && mode == QImode
33429 && (rclass == GENERAL_REGS
33430 || rclass == LEGACY_REGS
33431 || rclass == NON_Q_REGS
33432 || rclass == SIREG
33433 || rclass == DIREG
33434 || rclass == INDEX_REGS))
33436 int regno;
33438 if (REG_P (x))
33439 regno = REGNO (x);
33440 else
33441 regno = -1;
33443 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33444 regno = true_regnum (x);
33446 /* Return Q_REGS if the operand is in memory. */
33447 if (regno == -1)
33448 return Q_REGS;
 33451 /* This condition handles the corner case where an expression involving
33452 pointers gets vectorized. We're trying to use the address of a
33453 stack slot as a vector initializer.
33455 (set (reg:V2DI 74 [ vect_cst_.2 ])
33456 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33458 Eventually frame gets turned into sp+offset like this:
33460 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33461 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33462 (const_int 392 [0x188]))))
33464 That later gets turned into:
33466 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33467 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33468 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33470 We'll have the following reload recorded:
33472 Reload 0: reload_in (DI) =
33473 (plus:DI (reg/f:DI 7 sp)
33474 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33475 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33476 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33477 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33478 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33479 reload_reg_rtx: (reg:V2DI 22 xmm1)
 33481 This isn't going to work since SSE instructions can't handle scalar
 33482 additions. Returning GENERAL_REGS forces the addition into an integer
 33483 register, and reload can handle subsequent reloads without problems. */
33485 if (in_p && GET_CODE (x) == PLUS
33486 && SSE_CLASS_P (rclass)
33487 && SCALAR_INT_MODE_P (mode))
33488 return GENERAL_REGS;
33490 return NO_REGS;
33493 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33495 static bool
33496 ix86_class_likely_spilled_p (reg_class_t rclass)
33498 switch (rclass)
33500 case AREG:
33501 case DREG:
33502 case CREG:
33503 case BREG:
33504 case AD_REGS:
33505 case SIREG:
33506 case DIREG:
33507 case SSE_FIRST_REG:
33508 case FP_TOP_REG:
33509 case FP_SECOND_REG:
33510 return true;
33512 default:
33513 break;
33516 return false;
33519 /* If we are copying between general and FP registers, we need a memory
33520 location. The same is true for SSE and MMX registers.
33522 To optimize register_move_cost performance, allow inline variant.
33524 The macro can't work reliably when one of the CLASSES is class containing
33525 registers from multiple units (SSE, MMX, integer). We avoid this by never
33526 combining those units in single alternative in the machine description.
33527 Ensure that this constraint holds to avoid unexpected surprises.
33529 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33530 enforce these sanity checks. */
33532 static inline bool
33533 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33534 enum machine_mode mode, int strict)
33536 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33537 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33538 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33539 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33540 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33541 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33543 gcc_assert (!strict || lra_in_progress);
33544 return true;
33547 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33548 return true;
 33550 /* ??? This is a lie. We do have moves between mmx/general, and between
 33551 mmx/sse2. But by saying we need secondary memory we discourage the
33552 register allocator from using the mmx registers unless needed. */
33553 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33554 return true;
33556 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33558 /* SSE1 doesn't have any direct moves from other classes. */
33559 if (!TARGET_SSE2)
33560 return true;
33562 /* If the target says that inter-unit moves are more expensive
33563 than moving through memory, then don't generate them. */
33564 if (!TARGET_INTER_UNIT_MOVES)
33565 return true;
33567 /* Between SSE and general, we have moves no larger than word size. */
33568 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33569 return true;
33572 return false;
33575 bool
33576 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33577 enum machine_mode mode, int strict)
33579 return inline_secondary_memory_needed (class1, class2, mode, strict);
33582 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33584 On the 80386, this is the size of MODE in words,
33585 except in the FP regs, where a single reg is always enough. */
33587 static unsigned char
33588 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33590 if (MAYBE_INTEGER_CLASS_P (rclass))
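 /* In integer registers an XFmode value occupies 3 words with -m32 and
    2 words with -m64 (XCmode twice that); everything else is its size
    rounded up to whole words.  */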
33592 if (mode == XFmode)
33593 return (TARGET_64BIT ? 2 : 3);
33594 else if (mode == XCmode)
33595 return (TARGET_64BIT ? 4 : 6);
33596 else
33597 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33599 else
33601 if (COMPLEX_MODE_P (mode))
33602 return 2;
33603 else
33604 return 1;
33608 /* Return true if the registers in CLASS cannot represent the change from
33609 modes FROM to TO. */
33611 bool
33612 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33613 enum reg_class regclass)
33615 if (from == to)
33616 return false;
33618 /* x87 registers can't do subreg at all, as all values are reformatted
33619 to extended precision. */
33620 if (MAYBE_FLOAT_CLASS_P (regclass))
33621 return true;
33623 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33625 /* Vector registers do not support QI or HImode loads. If we don't
33626 disallow a change to these modes, reload will assume it's ok to
33627 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33628 the vec_dupv4hi pattern. */
33629 if (GET_MODE_SIZE (from) < 4)
33630 return true;
33632 /* Vector registers do not support subreg with nonzero offsets, which
33633 are otherwise valid for integer registers. Since we can't see
33634 whether we have a nonzero offset from here, prohibit all
33635 nonparadoxical subregs changing size. */
33636 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33637 return true;
33640 return false;
33643 /* Return the cost of moving data of mode M between a
33644 register and memory. A value of 2 is the default; this cost is
33645 relative to those in `REGISTER_MOVE_COST'.
33647 This function is used extensively by register_move_cost that is used to
33648 build tables at startup. Make it inline in this case.
33649 When IN is 2, return maximum of in and out move cost.
33651 If moving between registers and memory is more expensive than
33652 between two registers, you should define this macro to express the
33653 relative cost.
 33655 Also model the increased cost of moving QImode registers in non-
 33656 Q_REGS classes.
33658 static inline int
33659 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33660 int in)
33662 int cost;
33663 if (FLOAT_CLASS_P (regclass))
33665 int index;
33666 switch (mode)
33668 case SFmode:
33669 index = 0;
33670 break;
33671 case DFmode:
33672 index = 1;
33673 break;
33674 case XFmode:
33675 index = 2;
33676 break;
33677 default:
33678 return 100;
33680 if (in == 2)
33681 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33682 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33684 if (SSE_CLASS_P (regclass))
33686 int index;
33687 switch (GET_MODE_SIZE (mode))
33689 case 4:
33690 index = 0;
33691 break;
33692 case 8:
33693 index = 1;
33694 break;
33695 case 16:
33696 index = 2;
33697 break;
33698 default:
33699 return 100;
33701 if (in == 2)
33702 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33703 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33705 if (MMX_CLASS_P (regclass))
33707 int index;
33708 switch (GET_MODE_SIZE (mode))
33710 case 4:
33711 index = 0;
33712 break;
33713 case 8:
33714 index = 1;
33715 break;
33716 default:
33717 return 100;
 33719 if (in == 2)
33720 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33721 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33723 switch (GET_MODE_SIZE (mode))
33725 case 1:
33726 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33728 if (!in)
33729 return ix86_cost->int_store[0];
33730 if (TARGET_PARTIAL_REG_DEPENDENCY
33731 && optimize_function_for_speed_p (cfun))
33732 cost = ix86_cost->movzbl_load;
33733 else
33734 cost = ix86_cost->int_load[0];
33735 if (in == 2)
33736 return MAX (cost, ix86_cost->int_store[0]);
33737 return cost;
33739 else
33741 if (in == 2)
33742 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33743 if (in)
33744 return ix86_cost->movzbl_load;
33745 else
33746 return ix86_cost->int_store[0] + 4;
33748 break;
33749 case 2:
33750 if (in == 2)
33751 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33752 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33753 default:
 33754 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
33755 if (mode == TFmode)
33756 mode = XFmode;
33757 if (in == 2)
33758 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33759 else if (in)
33760 cost = ix86_cost->int_load[2];
33761 else
33762 cost = ix86_cost->int_store[2];
33763 return (cost * (((int) GET_MODE_SIZE (mode)
33764 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33768 static int
33769 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33770 bool in)
33772 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33776 /* Return the cost of moving data from a register in class CLASS1 to
33777 one in class CLASS2.
33779 It is not required that the cost always equal 2 when FROM is the same as TO;
33780 on some machines it is expensive to move between registers if they are not
33781 general registers. */
33783 static int
33784 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33785 reg_class_t class2_i)
33787 enum reg_class class1 = (enum reg_class) class1_i;
33788 enum reg_class class2 = (enum reg_class) class2_i;
 33790 /* In case we require secondary memory, compute the cost of the store
 33791 followed by the load. In order to avoid bad register allocation choices,
 33792 we need this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33794 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33796 int cost = 1;
33798 cost += inline_memory_move_cost (mode, class1, 2);
33799 cost += inline_memory_move_cost (mode, class2, 2);
 33801 /* In case of copying from a general purpose register we may emit multiple
 33802 stores followed by a single load, causing a memory size mismatch stall.
 33803 Count this as an arbitrarily high cost of 20. */
33804 if (targetm.class_max_nregs (class1, mode)
33805 > targetm.class_max_nregs (class2, mode))
33806 cost += 20;
33808 /* In the case of FP/MMX moves, the registers actually overlap, and we
33809 have to switch modes in order to treat them differently. */
33810 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33811 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33812 cost += 20;
33814 return cost;
33817 /* Moves between SSE/MMX and integer unit are expensive. */
33818 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33819 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
 33821 /* ??? By keeping the returned value relatively high, we limit the number
 33822 of moves between integer and MMX/SSE registers for all targets.
 33823 Additionally, a high value prevents problems with x86_modes_tieable_p(),
33824 where integer modes in MMX/SSE registers are not tieable
33825 because of missing QImode and HImode moves to, from or between
33826 MMX/SSE registers. */
33827 return MAX (8, ix86_cost->mmxsse_to_integer);
33829 if (MAYBE_FLOAT_CLASS_P (class1))
33830 return ix86_cost->fp_move;
33831 if (MAYBE_SSE_CLASS_P (class1))
33832 return ix86_cost->sse_move;
33833 if (MAYBE_MMX_CLASS_P (class1))
33834 return ix86_cost->mmx_move;
33835 return 2;
33838 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33839 MODE. */
33841 bool
33842 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
 33844 /* Flags, and only flags, can hold CCmode values. */
33845 if (CC_REGNO_P (regno))
33846 return GET_MODE_CLASS (mode) == MODE_CC;
33847 if (GET_MODE_CLASS (mode) == MODE_CC
33848 || GET_MODE_CLASS (mode) == MODE_RANDOM
33849 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33850 return false;
33851 if (STACK_REGNO_P (regno))
33852 return VALID_FP_MODE_P (mode);
33853 if (SSE_REGNO_P (regno))
33855 /* We implement the move patterns for all vector modes into and
33856 out of SSE registers, even when no operation instructions
33857 are available. OImode move is available only when AVX is
33858 enabled. */
33859 return ((TARGET_AVX && mode == OImode)
33860 || VALID_AVX256_REG_MODE (mode)
33861 || VALID_SSE_REG_MODE (mode)
33862 || VALID_SSE2_REG_MODE (mode)
33863 || VALID_MMX_REG_MODE (mode)
33864 || VALID_MMX_REG_MODE_3DNOW (mode));
33866 if (MMX_REGNO_P (regno))
33868 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33869 so if the register is available at all, then we can move data of
33870 the given mode into or out of it. */
33871 return (VALID_MMX_REG_MODE (mode)
33872 || VALID_MMX_REG_MODE_3DNOW (mode));
33875 if (mode == QImode)
33877 /* Take care for QImode values - they can be in non-QI regs,
33878 but then they do cause partial register stalls. */
33879 if (TARGET_64BIT || QI_REGNO_P (regno))
33880 return true;
33881 if (!TARGET_PARTIAL_REG_STALL)
33882 return true;
33883 return !can_create_pseudo_p ();
33885 /* We handle both integer and floats in the general purpose registers. */
33886 else if (VALID_INT_MODE_P (mode))
33887 return true;
33888 else if (VALID_FP_MODE_P (mode))
33889 return true;
33890 else if (VALID_DFP_MODE_P (mode))
33891 return true;
33892 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33893 on to use that value in smaller contexts, this can easily force a
33894 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33895 supporting DImode, allow it. */
33896 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33897 return true;
33899 return false;
33902 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33903 tieable integer mode. */
33905 static bool
33906 ix86_tieable_integer_mode_p (enum machine_mode mode)
33908 switch (mode)
33910 case HImode:
33911 case SImode:
33912 return true;
33914 case QImode:
33915 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33917 case DImode:
33918 return TARGET_64BIT;
33920 default:
33921 return false;
33925 /* Return true if MODE1 is accessible in a register that can hold MODE2
33926 without copying. That is, all register classes that can hold MODE2
33927 can also hold MODE1. */
33929 bool
33930 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33932 if (mode1 == mode2)
33933 return true;
33935 if (ix86_tieable_integer_mode_p (mode1)
33936 && ix86_tieable_integer_mode_p (mode2))
33937 return true;
33939 /* MODE2 being XFmode implies fp stack or general regs, which means we
33940 can tie any smaller floating point modes to it. Note that we do not
33941 tie this with TFmode. */
33942 if (mode2 == XFmode)
33943 return mode1 == SFmode || mode1 == DFmode;
33945 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33946 that we can tie it with SFmode. */
33947 if (mode2 == DFmode)
33948 return mode1 == SFmode;
33950 /* If MODE2 is only appropriate for an SSE register, then tie with
33951 any other mode acceptable to SSE registers. */
33952 if (GET_MODE_SIZE (mode2) == 32
33953 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33954 return (GET_MODE_SIZE (mode1) == 32
33955 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33956 if (GET_MODE_SIZE (mode2) == 16
33957 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33958 return (GET_MODE_SIZE (mode1) == 16
33959 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33961 /* If MODE2 is appropriate for an MMX register, then tie
33962 with any other mode acceptable to MMX registers. */
33963 if (GET_MODE_SIZE (mode2) == 8
33964 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33965 return (GET_MODE_SIZE (mode1) == 8
33966 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33968 return false;
33971 /* Return the cost of moving between two registers of mode MODE. */
33973 static int
33974 ix86_set_reg_reg_cost (enum machine_mode mode)
33976 unsigned int units = UNITS_PER_WORD;
33978 switch (GET_MODE_CLASS (mode))
33980 default:
33981 break;
33983 case MODE_CC:
33984 units = GET_MODE_SIZE (CCmode);
33985 break;
33987 case MODE_FLOAT:
33988 if ((TARGET_SSE && mode == TFmode)
33989 || (TARGET_80387 && mode == XFmode)
33990 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33991 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33992 units = GET_MODE_SIZE (mode);
33993 break;
33995 case MODE_COMPLEX_FLOAT:
33996 if ((TARGET_SSE && mode == TCmode)
33997 || (TARGET_80387 && mode == XCmode)
33998 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33999 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34000 units = GET_MODE_SIZE (mode);
34001 break;
34003 case MODE_VECTOR_INT:
34004 case MODE_VECTOR_FLOAT:
34005 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34006 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34007 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34008 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34009 units = GET_MODE_SIZE (mode);
34012 /* Return the cost of moving between two registers of mode MODE,
34013 assuming that the move will be in pieces of at most UNITS bytes. */
34014 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
34017 /* Compute a (partial) cost for rtx X. Return true if the complete
34018 cost has been computed, and false if subexpressions should be
34019 scanned. In either case, *TOTAL contains the cost result. */
34021 static bool
34022 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34023 bool speed)
34025 enum rtx_code code = (enum rtx_code) code_i;
34026 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34027 enum machine_mode mode = GET_MODE (x);
34028 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34030 switch (code)
34032 case SET:
34033 if (register_operand (SET_DEST (x), VOIDmode)
34034 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34036 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34037 return true;
34039 return false;
34041 case CONST_INT:
34042 case CONST:
34043 case LABEL_REF:
34044 case SYMBOL_REF:
34045 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34046 *total = 3;
34047 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34048 *total = 2;
34049 else if (flag_pic && SYMBOLIC_CONST (x)
34050 && (!TARGET_64BIT
 34051 || (GET_CODE (x) != LABEL_REF
34052 && (GET_CODE (x) != SYMBOL_REF
34053 || !SYMBOL_REF_LOCAL_P (x)))))
34054 *total = 1;
34055 else
34056 *total = 0;
34057 return true;
34059 case CONST_DOUBLE:
34060 if (mode == VOIDmode)
34062 *total = 0;
34063 return true;
34065 switch (standard_80387_constant_p (x))
34067 case 1: /* 0.0 */
34068 *total = 1;
34069 return true;
34070 default: /* Other constants */
34071 *total = 2;
34072 return true;
34073 case 0:
34074 case -1:
34075 break;
34077 if (SSE_FLOAT_MODE_P (mode))
34079 case CONST_VECTOR:
34080 switch (standard_sse_constant_p (x))
34082 case 0:
34083 break;
34084 case 1: /* 0: xor eliminates false dependency */
34085 *total = 0;
34086 return true;
34087 default: /* -1: cmp contains false dependency */
34088 *total = 1;
34089 return true;
34092 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34093 it'll probably end up. Add a penalty for size. */
34094 *total = (COSTS_N_INSNS (1)
34095 + (flag_pic != 0 && !TARGET_64BIT)
34096 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34097 return true;
34099 case ZERO_EXTEND:
 34100 /* The zero extension is often completely free on x86_64, so make
34101 it as cheap as possible. */
34102 if (TARGET_64BIT && mode == DImode
34103 && GET_MODE (XEXP (x, 0)) == SImode)
34104 *total = 1;
34105 else if (TARGET_ZERO_EXTEND_WITH_AND)
34106 *total = cost->add;
34107 else
34108 *total = cost->movzx;
34109 return false;
34111 case SIGN_EXTEND:
34112 *total = cost->movsx;
34113 return false;
34115 case ASHIFT:
34116 if (SCALAR_INT_MODE_P (mode)
34117 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34118 && CONST_INT_P (XEXP (x, 1)))
34120 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34121 if (value == 1)
34123 *total = cost->add;
34124 return false;
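 /* A left shift by 2 or 3 can instead be done as an lea with scale 4 or 8
    when the cost tables rate lea no higher than a constant shift.  */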
34126 if ((value == 2 || value == 3)
34127 && cost->lea <= cost->shift_const)
34129 *total = cost->lea;
34130 return false;
34133 /* FALLTHRU */
34135 case ROTATE:
34136 case ASHIFTRT:
34137 case LSHIFTRT:
34138 case ROTATERT:
34139 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34141 /* ??? Should be SSE vector operation cost. */
34142 /* At least for published AMD latencies, this really is the same
34143 as the latency for a simple fpu operation like fabs. */
34144 /* V*QImode is emulated with 1-11 insns. */
34145 if (mode == V16QImode || mode == V32QImode)
34147 int count = 11;
34148 if (TARGET_XOP && mode == V16QImode)
34150 /* For XOP we use vpshab, which requires a broadcast of the
34151 value to the variable shift insn. For constants this
 34152 means a V16QI constant in memory; even when we can perform the
 34153 shift with one insn, set the cost to prefer paddb. */
34154 if (CONSTANT_P (XEXP (x, 1)))
34156 *total = (cost->fabs
34157 + rtx_cost (XEXP (x, 0), code, 0, speed)
34158 + (speed ? 2 : COSTS_N_BYTES (16)));
34159 return true;
34161 count = 3;
34163 else if (TARGET_SSSE3)
34164 count = 7;
34165 *total = cost->fabs * count;
34167 else
34168 *total = cost->fabs;
34170 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34172 if (CONST_INT_P (XEXP (x, 1)))
34174 if (INTVAL (XEXP (x, 1)) > 32)
34175 *total = cost->shift_const + COSTS_N_INSNS (2);
34176 else
34177 *total = cost->shift_const * 2;
34179 else
34181 if (GET_CODE (XEXP (x, 1)) == AND)
34182 *total = cost->shift_var * 2;
34183 else
34184 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34187 else
34189 if (CONST_INT_P (XEXP (x, 1)))
34190 *total = cost->shift_const;
34191 else
34192 *total = cost->shift_var;
34194 return false;
34196 case FMA:
34198 rtx sub;
34200 gcc_assert (FLOAT_MODE_P (mode));
34201 gcc_assert (TARGET_FMA || TARGET_FMA4);
34203 /* ??? SSE scalar/vector cost should be used here. */
34204 /* ??? Bald assumption that fma has the same cost as fmul. */
34205 *total = cost->fmul;
34206 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34208 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34209 sub = XEXP (x, 0);
34210 if (GET_CODE (sub) == NEG)
34211 sub = XEXP (sub, 0);
34212 *total += rtx_cost (sub, FMA, 0, speed);
34214 sub = XEXP (x, 2);
34215 if (GET_CODE (sub) == NEG)
34216 sub = XEXP (sub, 0);
34217 *total += rtx_cost (sub, FMA, 2, speed);
34218 return true;
34221 case MULT:
34222 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34224 /* ??? SSE scalar cost should be used here. */
34225 *total = cost->fmul;
34226 return false;
34228 else if (X87_FLOAT_MODE_P (mode))
34230 *total = cost->fmul;
34231 return false;
34233 else if (FLOAT_MODE_P (mode))
34235 /* ??? SSE vector cost should be used here. */
34236 *total = cost->fmul;
34237 return false;
34239 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34241 /* V*QImode is emulated with 7-13 insns. */
34242 if (mode == V16QImode || mode == V32QImode)
34244 int extra = 11;
34245 if (TARGET_XOP && mode == V16QImode)
34246 extra = 5;
34247 else if (TARGET_SSSE3)
34248 extra = 6;
34249 *total = cost->fmul * 2 + cost->fabs * extra;
34251 /* V*DImode is emulated with 5-8 insns. */
34252 else if (mode == V2DImode || mode == V4DImode)
34254 if (TARGET_XOP && mode == V2DImode)
34255 *total = cost->fmul * 2 + cost->fabs * 3;
34256 else
34257 *total = cost->fmul * 3 + cost->fabs * 5;
34259 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34260 insns, including two PMULUDQ. */
34261 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34262 *total = cost->fmul * 2 + cost->fabs * 5;
34263 else
34264 *total = cost->fmul;
34265 return false;
34267 else
34269 rtx op0 = XEXP (x, 0);
34270 rtx op1 = XEXP (x, 1);
34271 int nbits;
34272 if (CONST_INT_P (XEXP (x, 1)))
34274 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34275 for (nbits = 0; value != 0; value &= value - 1)
34276 nbits++;
34278 else
34279 /* This is arbitrary. */
34280 nbits = 7;
34282 /* Compute costs correctly for widening multiplication. */
34283 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34284 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34285 == GET_MODE_SIZE (mode))
34287 int is_mulwiden = 0;
34288 enum machine_mode inner_mode = GET_MODE (op0);
34290 if (GET_CODE (op0) == GET_CODE (op1))
34291 is_mulwiden = 1, op1 = XEXP (op1, 0);
34292 else if (CONST_INT_P (op1))
34294 if (GET_CODE (op0) == SIGN_EXTEND)
34295 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34296 == INTVAL (op1);
34297 else
34298 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34301 if (is_mulwiden)
34302 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34305 *total = (cost->mult_init[MODE_INDEX (mode)]
34306 + nbits * cost->mult_bit
34307 + rtx_cost (op0, outer_code, opno, speed)
34308 + rtx_cost (op1, outer_code, opno, speed));
34310 return true;
34313 case DIV:
34314 case UDIV:
34315 case MOD:
34316 case UMOD:
34317 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34318 /* ??? SSE cost should be used here. */
34319 *total = cost->fdiv;
34320 else if (X87_FLOAT_MODE_P (mode))
34321 *total = cost->fdiv;
34322 else if (FLOAT_MODE_P (mode))
34323 /* ??? SSE vector cost should be used here. */
34324 *total = cost->fdiv;
34325 else
34326 *total = cost->divide[MODE_INDEX (mode)];
34327 return false;
34329 case PLUS:
34330 if (GET_MODE_CLASS (mode) == MODE_INT
34331 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
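 /* Expressions of the form base + index*scale + displacement (scale 2, 4
    or 8) match a single lea, so cost them as one lea plus the operands.  */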
34333 if (GET_CODE (XEXP (x, 0)) == PLUS
34334 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34335 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34336 && CONSTANT_P (XEXP (x, 1)))
34338 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34339 if (val == 2 || val == 4 || val == 8)
34341 *total = cost->lea;
34342 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34343 outer_code, opno, speed);
34344 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34345 outer_code, opno, speed);
34346 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34347 return true;
34350 else if (GET_CODE (XEXP (x, 0)) == MULT
34351 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34353 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34354 if (val == 2 || val == 4 || val == 8)
34356 *total = cost->lea;
34357 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34358 outer_code, opno, speed);
34359 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34360 return true;
34363 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34365 *total = cost->lea;
34366 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34367 outer_code, opno, speed);
34368 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34369 outer_code, opno, speed);
34370 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34371 return true;
34374 /* FALLTHRU */
34376 case MINUS:
34377 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34379 /* ??? SSE cost should be used here. */
34380 *total = cost->fadd;
34381 return false;
34383 else if (X87_FLOAT_MODE_P (mode))
34385 *total = cost->fadd;
34386 return false;
34388 else if (FLOAT_MODE_P (mode))
34390 /* ??? SSE vector cost should be used here. */
34391 *total = cost->fadd;
34392 return false;
34394 /* FALLTHRU */
34396 case AND:
34397 case IOR:
34398 case XOR:
34399 if (GET_MODE_CLASS (mode) == MODE_INT
34400 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34402 *total = (cost->add * 2
34403 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34404 << (GET_MODE (XEXP (x, 0)) != DImode))
34405 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34406 << (GET_MODE (XEXP (x, 1)) != DImode)));
34407 return true;
34409 /* FALLTHRU */
34411 case NEG:
34412 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34414 /* ??? SSE cost should be used here. */
34415 *total = cost->fchs;
34416 return false;
34418 else if (X87_FLOAT_MODE_P (mode))
34420 *total = cost->fchs;
34421 return false;
34423 else if (FLOAT_MODE_P (mode))
34425 /* ??? SSE vector cost should be used here. */
34426 *total = cost->fchs;
34427 return false;
34429 /* FALLTHRU */
34431 case NOT:
34432 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34434 /* ??? Should be SSE vector operation cost. */
34435 /* At least for published AMD latencies, this really is the same
34436 as the latency for a simple fpu operation like fabs. */
34437 *total = cost->fabs;
34439 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34440 *total = cost->add * 2;
34441 else
34442 *total = cost->add;
34443 return false;
34445 case COMPARE:
34446 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34447 && XEXP (XEXP (x, 0), 1) == const1_rtx
34448 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34449 && XEXP (x, 1) == const0_rtx)
34451 /* This kind of construct is implemented using test[bwl].
34452 Treat it as if we had an AND. */
34453 *total = (cost->add
34454 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34455 + rtx_cost (const1_rtx, outer_code, opno, speed));
34456 return true;
34458 return false;
34460 case FLOAT_EXTEND:
34461 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34462 *total = 0;
34463 return false;
34465 case ABS:
34466 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34467 /* ??? SSE cost should be used here. */
34468 *total = cost->fabs;
34469 else if (X87_FLOAT_MODE_P (mode))
34470 *total = cost->fabs;
34471 else if (FLOAT_MODE_P (mode))
34472 /* ??? SSE vector cost should be used here. */
34473 *total = cost->fabs;
34474 return false;
34476 case SQRT:
34477 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34478 /* ??? SSE cost should be used here. */
34479 *total = cost->fsqrt;
34480 else if (X87_FLOAT_MODE_P (mode))
34481 *total = cost->fsqrt;
34482 else if (FLOAT_MODE_P (mode))
34483 /* ??? SSE vector cost should be used here. */
34484 *total = cost->fsqrt;
34485 return false;
34487 case UNSPEC:
34488 if (XINT (x, 1) == UNSPEC_TP)
34489 *total = 0;
34490 return false;
34492 case VEC_SELECT:
34493 case VEC_CONCAT:
34494 case VEC_MERGE:
34495 case VEC_DUPLICATE:
 34496 /* ??? Assume all of these vector manipulation patterns are
 34497 recognizable, in which case they all pretty much have the
 34498 same cost. */
34499 *total = cost->fabs;
34500 return true;
34502 default:
34503 return false;
34507 #if TARGET_MACHO
34509 static int current_machopic_label_num;
34511 /* Given a symbol name and its associated stub, write out the
34512 definition of the stub. */
34514 void
34515 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34517 unsigned int length;
34518 char *binder_name, *symbol_name, lazy_ptr_name[32];
34519 int label = ++current_machopic_label_num;
34521 /* For 64-bit we shouldn't get here. */
34522 gcc_assert (!TARGET_64BIT);
34524 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34525 symb = targetm.strip_name_encoding (symb);
34527 length = strlen (stub);
34528 binder_name = XALLOCAVEC (char, length + 32);
34529 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34531 length = strlen (symb);
34532 symbol_name = XALLOCAVEC (char, length + 32);
34533 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34535 sprintf (lazy_ptr_name, "L%d$lz", label);
34537 if (MACHOPIC_ATT_STUB)
34538 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34539 else if (MACHOPIC_PURE)
34540 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34541 else
34542 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34544 fprintf (file, "%s:\n", stub);
34545 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34547 if (MACHOPIC_ATT_STUB)
34549 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34551 else if (MACHOPIC_PURE)
34553 /* PIC stub. */
34554 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34555 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34556 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34557 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34558 label, lazy_ptr_name, label);
34559 fprintf (file, "\tjmp\t*%%ecx\n");
34561 else
34562 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34564 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34565 it needs no stub-binding-helper. */
34566 if (MACHOPIC_ATT_STUB)
34567 return;
34569 fprintf (file, "%s:\n", binder_name);
34571 if (MACHOPIC_PURE)
34573 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34574 fprintf (file, "\tpushl\t%%ecx\n");
34576 else
34577 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34579 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34581 /* N.B. Keep the correspondence of these
34582 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34583 old-pic/new-pic/non-pic stubs; altering this will break
34584 compatibility with existing dylibs. */
34585 if (MACHOPIC_PURE)
34587 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34588 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34590 else
34591 /* 16-byte -mdynamic-no-pic stub. */
34592 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34594 fprintf (file, "%s:\n", lazy_ptr_name);
34595 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34596 fprintf (file, ASM_LONG "%s\n", binder_name);
34598 #endif /* TARGET_MACHO */
34600 /* Order the registers for register allocator. */
34602 void
34603 x86_order_regs_for_local_alloc (void)
34605 int pos = 0;
34606 int i;
34608 /* First allocate the local general purpose registers. */
34609 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34610 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34611 reg_alloc_order [pos++] = i;
34613 /* Global general purpose registers. */
34614 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34615 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34616 reg_alloc_order [pos++] = i;
34618 /* x87 registers come first in case we are doing FP math
34619 using them. */
34620 if (!TARGET_SSE_MATH)
34621 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34622 reg_alloc_order [pos++] = i;
34624 /* SSE registers. */
34625 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34626 reg_alloc_order [pos++] = i;
34627 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34628 reg_alloc_order [pos++] = i;
34630 /* x87 registers. */
34631 if (TARGET_SSE_MATH)
34632 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34633 reg_alloc_order [pos++] = i;
34635 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34636 reg_alloc_order [pos++] = i;
34638 /* Initialize the rest of the array, as we do not allocate some
34639 registers at all. */
34640 while (pos < FIRST_PSEUDO_REGISTER)
34641 reg_alloc_order [pos++] = 0;
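/* Informally, the resulting order is: call-clobbered GPRs, call-saved
   GPRs, the x87 stack registers (before or after the SSE registers,
   depending on whether SSE math is in use), SSE and REX SSE registers,
   MMX registers, and finally zero padding for the unallocated slots.  */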
34644 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34645 in struct attribute_spec.handler. */
34646 static tree
34647 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34648 tree args,
34649 int flags ATTRIBUTE_UNUSED,
34650 bool *no_add_attrs)
34652 if (TREE_CODE (*node) != FUNCTION_TYPE
34653 && TREE_CODE (*node) != METHOD_TYPE
34654 && TREE_CODE (*node) != FIELD_DECL
34655 && TREE_CODE (*node) != TYPE_DECL)
34657 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34658 name);
34659 *no_add_attrs = true;
34660 return NULL_TREE;
34662 if (TARGET_64BIT)
34664 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34665 name);
34666 *no_add_attrs = true;
34667 return NULL_TREE;
34669 if (is_attribute_p ("callee_pop_aggregate_return", name))
34671 tree cst;
34673 cst = TREE_VALUE (args);
34674 if (TREE_CODE (cst) != INTEGER_CST)
34676 warning (OPT_Wattributes,
34677 "%qE attribute requires an integer constant argument",
34678 name);
34679 *no_add_attrs = true;
34681 else if (compare_tree_int (cst, 0) != 0
34682 && compare_tree_int (cst, 1) != 0)
34684 warning (OPT_Wattributes,
34685 "argument to %qE attribute is neither zero, nor one",
34686 name);
34687 *no_add_attrs = true;
34690 return NULL_TREE;
34693 return NULL_TREE;
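/* Illustrative (hypothetical user code): the attribute is accepted only on
   32-bit function types and takes a 0/1 argument, e.g.
     struct S f (void) __attribute__ ((callee_pop_aggregate_return (1)));  */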
34696 /* Handle an "ms_abi" or "sysv_abi" attribute; arguments as in
34697 struct attribute_spec.handler. */
34698 static tree
34699 ix86_handle_abi_attribute (tree *node, tree name,
34700 tree args ATTRIBUTE_UNUSED,
34701 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34703 if (TREE_CODE (*node) != FUNCTION_TYPE
34704 && TREE_CODE (*node) != METHOD_TYPE
34705 && TREE_CODE (*node) != FIELD_DECL
34706 && TREE_CODE (*node) != TYPE_DECL)
34708 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34709 name);
34710 *no_add_attrs = true;
34711 return NULL_TREE;
34714 /* Can combine regparm with all attributes but fastcall. */
34715 if (is_attribute_p ("ms_abi", name))
34717 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34719 error ("ms_abi and sysv_abi attributes are not compatible");
34722 return NULL_TREE;
34724 else if (is_attribute_p ("sysv_abi", name))
34726 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34728 error ("ms_abi and sysv_abi attributes are not compatible");
34731 return NULL_TREE;
34734 return NULL_TREE;
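/* Illustrative (hypothetical user code): either ABI may be requested per
   function, but not both on the same type, e.g.
     void f (int) __attribute__ ((ms_abi));  */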
34737 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34738 struct attribute_spec.handler. */
34739 static tree
34740 ix86_handle_struct_attribute (tree *node, tree name,
34741 tree args ATTRIBUTE_UNUSED,
34742 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34744 tree *type = NULL;
34745 if (DECL_P (*node))
34747 if (TREE_CODE (*node) == TYPE_DECL)
34748 type = &TREE_TYPE (*node);
34750 else
34751 type = node;
34753 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34755 warning (OPT_Wattributes, "%qE attribute ignored",
34756 name);
34757 *no_add_attrs = true;
34760 else if ((is_attribute_p ("ms_struct", name)
34761 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34762 || ((is_attribute_p ("gcc_struct", name)
34763 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34765 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34766 name);
34767 *no_add_attrs = true;
34770 return NULL_TREE;
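/* Illustrative (hypothetical user code):
     struct __attribute__ ((ms_struct)) S { char c; int i; };
   Requesting both ms_struct and gcc_struct on one type is rejected above.  */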
34773 static tree
34774 ix86_handle_fndecl_attribute (tree *node, tree name,
34775 tree args ATTRIBUTE_UNUSED,
34776 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34778 if (TREE_CODE (*node) != FUNCTION_DECL)
34780 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34781 name);
34782 *no_add_attrs = true;
34784 return NULL_TREE;
34787 static bool
34788 ix86_ms_bitfield_layout_p (const_tree record_type)
34790 return ((TARGET_MS_BITFIELD_LAYOUT
34791 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34792 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34795 /* Returns an expression indicating where the this parameter is
34796 located on entry to the FUNCTION. */
34798 static rtx
34799 x86_this_parameter (tree function)
34801 tree type = TREE_TYPE (function);
34802 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34803 int nregs;
34805 if (TARGET_64BIT)
34807 const int *parm_regs;
34809 if (ix86_function_type_abi (type) == MS_ABI)
34810 parm_regs = x86_64_ms_abi_int_parameter_registers;
34811 else
34812 parm_regs = x86_64_int_parameter_registers;
34813 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34816 nregs = ix86_function_regparm (type, function);
34818 if (nregs > 0 && !stdarg_p (type))
34820 int regno;
34821 unsigned int ccvt = ix86_get_callcvt (type);
34823 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34824 regno = aggr ? DX_REG : CX_REG;
34825 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34827 regno = CX_REG;
34828 if (aggr)
34829 return gen_rtx_MEM (SImode,
34830 plus_constant (Pmode, stack_pointer_rtx, 4));
34832 else
34834 regno = AX_REG;
34835 if (aggr)
34837 regno = DX_REG;
34838 if (nregs == 1)
34839 return gen_rtx_MEM (SImode,
34840 plus_constant (Pmode,
34841 stack_pointer_rtx, 4));
34844 return gen_rtx_REG (SImode, regno);
34847 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34848 aggr ? 8 : 4));
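/* Summary (informal): on 64-bit, THIS is in the first integer argument
   register (the second one when the return value is an aggregate passed via
   a hidden pointer); on 32-bit, the regparm/fastcall/thiscall cases above
   place it in ECX/EDX/EAX, and otherwise it lives at 4(%esp), or 8(%esp)
   when an aggregate-return pointer occupies the first stack slot.  */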
34851 /* Determine whether x86_output_mi_thunk can succeed. */
34853 static bool
34854 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34855 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34856 HOST_WIDE_INT vcall_offset, const_tree function)
34858 /* 64-bit can handle anything. */
34859 if (TARGET_64BIT)
34860 return true;
34862 /* For 32-bit, everything's fine if we have one free register. */
34863 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34864 return true;
34866 /* Need a free register for vcall_offset. */
34867 if (vcall_offset)
34868 return false;
34870 /* Need a free register for GOT references. */
34871 if (flag_pic && !targetm.binds_local_p (function))
34872 return false;
34874 /* Otherwise ok. */
34875 return true;
34878 /* Output the assembler code for a thunk function. THUNK_DECL is the
34879 declaration for the thunk function itself, FUNCTION is the decl for
34880 the target function. DELTA is an immediate constant offset to be
34881 added to THIS. If VCALL_OFFSET is nonzero, the word at
34882 *(*this + vcall_offset) should be added to THIS. */
34884 static void
34885 x86_output_mi_thunk (FILE *file,
34886 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34887 HOST_WIDE_INT vcall_offset, tree function)
34889 rtx this_param = x86_this_parameter (function);
34890 rtx this_reg, tmp, fnaddr;
34891 unsigned int tmp_regno;
34893 if (TARGET_64BIT)
34894 tmp_regno = R10_REG;
34895 else
34897 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34898 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34899 tmp_regno = AX_REG;
34900 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34901 tmp_regno = DX_REG;
34902 else
34903 tmp_regno = CX_REG;
34906 emit_note (NOTE_INSN_PROLOGUE_END);
34908 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34909 pull it in now and let DELTA benefit. */
34910 if (REG_P (this_param))
34911 this_reg = this_param;
34912 else if (vcall_offset)
34914 /* Put the this parameter into %eax. */
34915 this_reg = gen_rtx_REG (Pmode, AX_REG);
34916 emit_move_insn (this_reg, this_param);
34918 else
34919 this_reg = NULL_RTX;
34921 /* Adjust the this parameter by a fixed constant. */
34922 if (delta)
34924 rtx delta_rtx = GEN_INT (delta);
34925 rtx delta_dst = this_reg ? this_reg : this_param;
34927 if (TARGET_64BIT)
34929 if (!x86_64_general_operand (delta_rtx, Pmode))
34931 tmp = gen_rtx_REG (Pmode, tmp_regno);
34932 emit_move_insn (tmp, delta_rtx);
34933 delta_rtx = tmp;
34937 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34940 /* Adjust the this parameter by a value stored in the vtable. */
34941 if (vcall_offset)
34943 rtx vcall_addr, vcall_mem, this_mem;
34945 tmp = gen_rtx_REG (Pmode, tmp_regno);
34947 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
34948 if (Pmode != ptr_mode)
34949 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34950 emit_move_insn (tmp, this_mem);
34952 /* Adjust the this parameter. */
34953 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34954 if (TARGET_64BIT
34955 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34957 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34958 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34959 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34962 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34963 if (Pmode != ptr_mode)
34964 emit_insn (gen_addsi_1_zext (this_reg,
34965 gen_rtx_REG (ptr_mode,
34966 REGNO (this_reg)),
34967 vcall_mem));
34968 else
34969 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34972 /* If necessary, drop THIS back to its stack slot. */
34973 if (this_reg && this_reg != this_param)
34974 emit_move_insn (this_param, this_reg);
34976 fnaddr = XEXP (DECL_RTL (function), 0);
34977 if (TARGET_64BIT)
34979 if (!flag_pic || targetm.binds_local_p (function)
34980 || cfun->machine->call_abi == MS_ABI)
34982 else
34984 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34985 tmp = gen_rtx_CONST (Pmode, tmp);
34986 fnaddr = gen_rtx_MEM (Pmode, tmp);
34989 else
34991 if (!flag_pic || targetm.binds_local_p (function))
34993 #if TARGET_MACHO
34994 else if (TARGET_MACHO)
34996 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34997 fnaddr = XEXP (fnaddr, 0);
34999 #endif /* TARGET_MACHO */
35000 else
35002 tmp = gen_rtx_REG (Pmode, CX_REG);
35003 output_set_got (tmp, NULL_RTX);
35005 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35006 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35007 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35011 /* Our sibling call patterns do not allow memories, because we have no
35012 predicate that can distinguish between frame and non-frame memory.
35013 For our purposes here, we can get away with (ab)using a jump pattern,
35014 because we're going to do no optimization. */
35015 if (MEM_P (fnaddr))
35016 emit_jump_insn (gen_indirect_jump (fnaddr));
35017 else
35019 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35020 fnaddr = legitimize_pic_address (fnaddr,
35021 gen_rtx_REG (Pmode, tmp_regno));
35023 if (!sibcall_insn_operand (fnaddr, word_mode))
35025 tmp = gen_rtx_REG (word_mode, tmp_regno);
35026 if (GET_MODE (fnaddr) != word_mode)
35027 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35028 emit_move_insn (tmp, fnaddr);
35029 fnaddr = tmp;
35032 tmp = gen_rtx_MEM (QImode, fnaddr);
35033 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35034 tmp = emit_call_insn (tmp);
35035 SIBLING_CALL_P (tmp) = 1;
35037 emit_barrier ();
35039 /* Emit just enough of rest_of_compilation to get the insns emitted.
35040 Note that use_thunk calls assemble_start_function et al. */
35041 tmp = get_insns ();
35042 shorten_branches (tmp);
35043 final_start_function (tmp, file, 1);
35044 final (tmp, file, 1);
35045 final_end_function ();
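/* Conceptually, the emitted thunk performs
     this += DELTA;
     if (VCALL_OFFSET) this += *((*this) + VCALL_OFFSET);
   and then tail-calls FUNCTION.  */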
35048 static void
35049 x86_file_start (void)
35051 default_file_start ();
35052 #if TARGET_MACHO
35053 darwin_file_start ();
35054 #endif
35055 if (X86_FILE_START_VERSION_DIRECTIVE)
35056 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35057 if (X86_FILE_START_FLTUSED)
35058 fputs ("\t.global\t__fltused\n", asm_out_file);
35059 if (ix86_asm_dialect == ASM_INTEL)
35060 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35063 int
35064 x86_field_alignment (tree field, int computed)
35066 enum machine_mode mode;
35067 tree type = TREE_TYPE (field);
35069 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35070 return computed;
35071 mode = TYPE_MODE (strip_array_types (type));
35072 if (mode == DFmode || mode == DCmode
35073 || GET_MODE_CLASS (mode) == MODE_INT
35074 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35075 return MIN (32, computed);
35076 return computed;
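/* E.g. a "double" field is limited to 4-byte alignment on ia32 unless
   -malign-double (TARGET_ALIGN_DOUBLE) is given, matching the traditional
   32-bit ABI; 64-bit targets keep the natural alignment.  */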
35079 /* Output assembler code to FILE to increment profiler label # LABELNO
35080 for profiling a function entry. */
35081 void
35082 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35084 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35085 : MCOUNT_NAME);
35087 if (TARGET_64BIT)
35089 #ifndef NO_PROFILE_COUNTERS
35090 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35091 #endif
35093 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35094 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35095 else
35096 fprintf (file, "\tcall\t%s\n", mcount_name);
35098 else if (flag_pic)
35100 #ifndef NO_PROFILE_COUNTERS
35101 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35102 LPREFIX, labelno);
35103 #endif
35104 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35106 else
35108 #ifndef NO_PROFILE_COUNTERS
35109 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35110 LPREFIX, labelno);
35111 #endif
35112 fprintf (file, "\tcall\t%s\n", mcount_name);
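/* For example, 64-bit SysV PIC code ends up with something like
   "call *mcount@GOTPCREL(%rip)" (the exact symbol depends on MCOUNT_NAME
   and -mfentry), while non-PIC 32-bit code gets a plain "call mcount"
   after loading the profile counter address.  */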
35116 /* We don't have exact information about the insn sizes, but we may assume
35117 quite safely that we are informed about all 1 byte insns and memory
35118 address sizes. This is enough to eliminate unnecessary padding in
35119 99% of cases. */
35121 static int
35122 min_insn_size (rtx insn)
35124 int l = 0, len;
35126 if (!INSN_P (insn) || !active_insn_p (insn))
35127 return 0;
35129 /* Discard alignments we've emitted, and jump instructions. */
35130 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35131 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35132 return 0;
35133 if (JUMP_TABLE_DATA_P (insn))
35134 return 0;
35136 /* Important case - calls are always 5 bytes.
35137 It is common to have many calls in a row. */
35138 if (CALL_P (insn)
35139 && symbolic_reference_mentioned_p (PATTERN (insn))
35140 && !SIBLING_CALL_P (insn))
35141 return 5;
35142 len = get_attr_length (insn);
35143 if (len <= 1)
35144 return 1;
35146 /* For normal instructions we rely on get_attr_length being exact,
35147 with a few exceptions. */
35148 if (!JUMP_P (insn))
35150 enum attr_type type = get_attr_type (insn);
35152 switch (type)
35154 case TYPE_MULTI:
35155 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35156 || asm_noperands (PATTERN (insn)) >= 0)
35157 return 0;
35158 break;
35159 case TYPE_OTHER:
35160 case TYPE_FCMP:
35161 break;
35162 default:
35163 /* Otherwise trust get_attr_length. */
35164 return len;
35167 l = get_attr_length_address (insn);
35168 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35169 l = 4;
35171 if (l)
35172 return 1+l;
35173 else
35174 return 2;
35177 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35179 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
35180 window. */
35182 static void
35183 ix86_avoid_jump_mispredicts (void)
35185 rtx insn, start = get_insns ();
35186 int nbytes = 0, njumps = 0;
35187 int isjump = 0;
35189 /* Look for all minimal intervals of instructions containing 4 jumps.
35190 The intervals are bounded by START and INSN. NBYTES is the total
35191 size of instructions in the interval including INSN and not including
35192 START. When NBYTES is smaller than 16 bytes, it is possible
35193 that the end of START and INSN end up in the same 16-byte page.
35195 The smallest offset in the page at which INSN can start is when START
35196 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
35197 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
35199 for (insn = start; insn; insn = NEXT_INSN (insn))
35201 int min_size;
35203 if (LABEL_P (insn))
35205 int align = label_to_alignment (insn);
35206 int max_skip = label_to_max_skip (insn);
35208 if (max_skip > 15)
35209 max_skip = 15;
35210 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35211 already in the current 16 byte page, because otherwise
35212 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35213 bytes to reach 16 byte boundary. */
35214 if (align <= 0
35215 || (align <= 3 && max_skip != (1 << align) - 1))
35216 max_skip = 0;
35217 if (dump_file)
35218 fprintf (dump_file, "Label %i with max_skip %i\n",
35219 INSN_UID (insn), max_skip);
35220 if (max_skip)
35222 while (nbytes + max_skip >= 16)
35224 start = NEXT_INSN (start);
35225 if ((JUMP_P (start)
35226 && GET_CODE (PATTERN (start)) != ADDR_VEC
35227 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35228 || CALL_P (start))
35229 njumps--, isjump = 1;
35230 else
35231 isjump = 0;
35232 nbytes -= min_insn_size (start);
35235 continue;
35238 min_size = min_insn_size (insn);
35239 nbytes += min_size;
35240 if (dump_file)
35241 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35242 INSN_UID (insn), min_size);
35243 if ((JUMP_P (insn)
35244 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35245 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35246 || CALL_P (insn))
35247 njumps++;
35248 else
35249 continue;
35251 while (njumps > 3)
35253 start = NEXT_INSN (start);
35254 if ((JUMP_P (start)
35255 && GET_CODE (PATTERN (start)) != ADDR_VEC
35256 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35257 || CALL_P (start))
35258 njumps--, isjump = 1;
35259 else
35260 isjump = 0;
35261 nbytes -= min_insn_size (start);
35263 gcc_assert (njumps >= 0);
35264 if (dump_file)
35265 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35266 INSN_UID (start), INSN_UID (insn), nbytes);
35268 if (njumps == 3 && isjump && nbytes < 16)
35270 int padsize = 15 - nbytes + min_insn_size (insn);
35272 if (dump_file)
35273 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35274 INSN_UID (insn), padsize);
35275 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35279 #endif
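/* In effect: when four jumps or calls would otherwise fit inside one
   16-byte window, a gen_pad insn is emitted before the last of them so
   that it starts in the next 16-byte window.  */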
35281 /* AMD Athlon works faster
35282 when RET is not the destination of a conditional jump or directly preceded
35283 by another jump instruction. We avoid the penalty by inserting a NOP just
35284 before the RET instructions in such cases. */
35285 static void
35286 ix86_pad_returns (void)
35288 edge e;
35289 edge_iterator ei;
35291 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35293 basic_block bb = e->src;
35294 rtx ret = BB_END (bb);
35295 rtx prev;
35296 bool replace = false;
35298 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35299 || optimize_bb_for_size_p (bb))
35300 continue;
35301 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35302 if (active_insn_p (prev) || LABEL_P (prev))
35303 break;
35304 if (prev && LABEL_P (prev))
35306 edge e;
35307 edge_iterator ei;
35309 FOR_EACH_EDGE (e, ei, bb->preds)
35310 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35311 && !(e->flags & EDGE_FALLTHRU))
35312 replace = true;
35314 if (!replace)
35316 prev = prev_active_insn (ret);
35317 if (prev
35318 && ((JUMP_P (prev) && any_condjump_p (prev))
35319 || CALL_P (prev)))
35320 replace = true;
35321 /* Empty functions get a branch mispredict even when
35322 the jump destination is not visible to us. */
35323 if (!prev && !optimize_function_for_size_p (cfun))
35324 replace = true;
35326 if (replace)
35328 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35329 delete_insn (ret);
35334 /* Count the minimum number of instructions in BB. Return 4 if the
35335 number of instructions >= 4. */
35337 static int
35338 ix86_count_insn_bb (basic_block bb)
35340 rtx insn;
35341 int insn_count = 0;
35343 /* Count number of instructions in this block. Return 4 if the number
35344 of instructions >= 4. */
35345 FOR_BB_INSNS (bb, insn)
35347 /* This only happens in exit blocks. */
35348 if (JUMP_P (insn)
35349 && ANY_RETURN_P (PATTERN (insn)))
35350 break;
35352 if (NONDEBUG_INSN_P (insn)
35353 && GET_CODE (PATTERN (insn)) != USE
35354 && GET_CODE (PATTERN (insn)) != CLOBBER)
35356 insn_count++;
35357 if (insn_count >= 4)
35358 return insn_count;
35362 return insn_count;
35366 /* Count the minimum number of instructions in code path in BB.
35367 Return 4 if the number of instructions >= 4. */
35369 static int
35370 ix86_count_insn (basic_block bb)
35372 edge e;
35373 edge_iterator ei;
35374 int min_prev_count;
35376 /* Only bother counting instructions along paths with no
35377 more than 2 basic blocks between entry and exit. Given
35378 that BB has an edge to exit, determine if a predecessor
35379 of BB has an edge from entry. If so, compute the number
35380 of instructions in the predecessor block. If there
35381 happen to be multiple such blocks, compute the minimum. */
35382 min_prev_count = 4;
35383 FOR_EACH_EDGE (e, ei, bb->preds)
35385 edge prev_e;
35386 edge_iterator prev_ei;
35388 if (e->src == ENTRY_BLOCK_PTR)
35390 min_prev_count = 0;
35391 break;
35393 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35395 if (prev_e->src == ENTRY_BLOCK_PTR)
35397 int count = ix86_count_insn_bb (e->src);
35398 if (count < min_prev_count)
35399 min_prev_count = count;
35400 break;
35405 if (min_prev_count < 4)
35406 min_prev_count += ix86_count_insn_bb (bb);
35408 return min_prev_count;
35411 /* Pad short function to 4 instructions. */
35413 static void
35414 ix86_pad_short_function (void)
35416 edge e;
35417 edge_iterator ei;
35419 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35421 rtx ret = BB_END (e->src);
35422 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35424 int insn_count = ix86_count_insn (e->src);
35426 /* Pad short function. */
35427 if (insn_count < 4)
35429 rtx insn = ret;
35431 /* Find epilogue. */
35432 while (insn
35433 && (!NOTE_P (insn)
35434 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35435 insn = PREV_INSN (insn);
35437 if (!insn)
35438 insn = ret;
35440 /* Two NOPs count as one instruction. */
35441 insn_count = 2 * (4 - insn_count);
35442 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35448 /* Implement machine specific optimizations. We implement padding of returns
35449 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
35450 static void
35451 ix86_reorg (void)
35453 /* We are freeing block_for_insn in the toplev to keep compatibility
35454 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35455 compute_bb_for_insn ();
35457 if (optimize && optimize_function_for_speed_p (cfun))
35459 if (TARGET_PAD_SHORT_FUNCTION)
35460 ix86_pad_short_function ();
35461 else if (TARGET_PAD_RETURNS)
35462 ix86_pad_returns ();
35463 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35464 if (TARGET_FOUR_JUMP_LIMIT)
35465 ix86_avoid_jump_mispredicts ();
35466 #endif
35470 /* Return nonzero when a QImode register that must be represented via a
35471 REX prefix is used. */
35472 bool
35473 x86_extended_QIreg_mentioned_p (rtx insn)
35475 int i;
35476 extract_insn_cached (insn);
35477 for (i = 0; i < recog_data.n_operands; i++)
35478 if (GENERAL_REG_P (recog_data.operand[i])
35479 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35480 return true;
35481 return false;
35484 /* Return nonzero when P points to a register encoded via a REX prefix.
35485 Called via for_each_rtx. */
35486 static int
35487 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35489 unsigned int regno;
35490 if (!REG_P (*p))
35491 return 0;
35492 regno = REGNO (*p);
35493 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35496 /* Return true when INSN mentions register that must be encoded using REX
35497 prefix. */
35498 bool
35499 x86_extended_reg_mentioned_p (rtx insn)
35501 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35502 extended_reg_mentioned_1, NULL);
35505 /* If profitable, negate (without causing overflow) integer constant
35506 of mode MODE at location LOC. Return true in this case. */
35507 bool
35508 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35510 HOST_WIDE_INT val;
35512 if (!CONST_INT_P (*loc))
35513 return false;
35515 switch (mode)
35517 case DImode:
35518 /* DImode x86_64 constants must fit in 32 bits. */
35519 gcc_assert (x86_64_immediate_operand (*loc, mode));
35521 mode = SImode;
35522 break;
35524 case SImode:
35525 case HImode:
35526 case QImode:
35527 break;
35529 default:
35530 gcc_unreachable ();
35533 /* Avoid overflows. */
35534 if (mode_signbit_p (mode, *loc))
35535 return false;
35537 val = INTVAL (*loc);
35539 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35540 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
35541 if ((val < 0 && val != -128)
35542 || val == 128)
35544 *loc = GEN_INT (-val);
35545 return true;
35548 return false;
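/* E.g. (const_int -4) is rewritten to 4 so the caller can emit
   "subl $4, %eax" instead of "addl $-4, %eax"; 128 is negated to -128
   (which still fits a signed 8-bit immediate), while -128 itself is left
   alone because 128 would not.  */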
35551 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35552 optabs would emit if we didn't have TFmode patterns. */
35554 void
35555 x86_emit_floatuns (rtx operands[2])
35557 rtx neglab, donelab, i0, i1, f0, in, out;
35558 enum machine_mode mode, inmode;
35560 inmode = GET_MODE (operands[1]);
35561 gcc_assert (inmode == SImode || inmode == DImode);
35563 out = operands[0];
35564 in = force_reg (inmode, operands[1]);
35565 mode = GET_MODE (out);
35566 neglab = gen_label_rtx ();
35567 donelab = gen_label_rtx ();
35568 f0 = gen_reg_rtx (mode);
35570 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35572 expand_float (out, in, 0);
35574 emit_jump_insn (gen_jump (donelab));
35575 emit_barrier ();
35577 emit_label (neglab);
35579 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35580 1, OPTAB_DIRECT);
35581 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35582 1, OPTAB_DIRECT);
35583 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35585 expand_float (f0, i0, 0);
35587 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35589 emit_label (donelab);
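/* Informal sketch of the negative path above: the input is halved with the
   dropped low bit OR-ed back in (so rounding still comes out right), the
   signed conversion is applied, and the result is doubled:
     f0 = (FP) ((in >> 1) | (in & 1));  out = f0 + f0;  */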
35592 /* AVX2 does support 32-byte integer vector operations,
35593 thus the longest vector we are faced with is V32QImode. */
35594 #define MAX_VECT_LEN 32
35596 struct expand_vec_perm_d
35598 rtx target, op0, op1;
35599 unsigned char perm[MAX_VECT_LEN];
35600 enum machine_mode vmode;
35601 unsigned char nelt;
35602 bool one_operand_p;
35603 bool testing_p;
35606 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35607 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35608 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35610 /* Get a vector mode of the same size as the original but with elements
35611 twice as wide. This is only guaranteed to apply to integral vectors. */
35613 static inline enum machine_mode
35614 get_mode_wider_vector (enum machine_mode o)
35616 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35617 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35618 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35619 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35620 return n;
35623 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35624 with all elements equal to VAR. Return true if successful. */
35626 static bool
35627 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35628 rtx target, rtx val)
35630 bool ok;
35632 switch (mode)
35634 case V2SImode:
35635 case V2SFmode:
35636 if (!mmx_ok)
35637 return false;
35638 /* FALLTHRU */
35640 case V4DFmode:
35641 case V4DImode:
35642 case V8SFmode:
35643 case V8SImode:
35644 case V2DFmode:
35645 case V2DImode:
35646 case V4SFmode:
35647 case V4SImode:
35649 rtx insn, dup;
35651 /* First attempt to recognize VAL as-is. */
35652 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35653 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35654 if (recog_memoized (insn) < 0)
35656 rtx seq;
35657 /* If that fails, force VAL into a register. */
35659 start_sequence ();
35660 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35661 seq = get_insns ();
35662 end_sequence ();
35663 if (seq)
35664 emit_insn_before (seq, insn);
35666 ok = recog_memoized (insn) >= 0;
35667 gcc_assert (ok);
35670 return true;
35672 case V4HImode:
35673 if (!mmx_ok)
35674 return false;
35675 if (TARGET_SSE || TARGET_3DNOW_A)
35677 rtx x;
35679 val = gen_lowpart (SImode, val);
35680 x = gen_rtx_TRUNCATE (HImode, val);
35681 x = gen_rtx_VEC_DUPLICATE (mode, x);
35682 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35683 return true;
35685 goto widen;
35687 case V8QImode:
35688 if (!mmx_ok)
35689 return false;
35690 goto widen;
35692 case V8HImode:
35693 if (TARGET_SSE2)
35695 struct expand_vec_perm_d dperm;
35696 rtx tmp1, tmp2;
35698 permute:
35699 memset (&dperm, 0, sizeof (dperm));
35700 dperm.target = target;
35701 dperm.vmode = mode;
35702 dperm.nelt = GET_MODE_NUNITS (mode);
35703 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35704 dperm.one_operand_p = true;
35706 /* Extend to SImode using a paradoxical SUBREG. */
35707 tmp1 = gen_reg_rtx (SImode);
35708 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35710 /* Insert the SImode value as low element of a V4SImode vector. */
35711 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35712 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35714 ok = (expand_vec_perm_1 (&dperm)
35715 || expand_vec_perm_broadcast_1 (&dperm));
35716 gcc_assert (ok);
35717 return ok;
35719 goto widen;
35721 case V16QImode:
35722 if (TARGET_SSE2)
35723 goto permute;
35724 goto widen;
35726 widen:
35727 /* Replicate the value once into the next wider mode and recurse. */
35729 enum machine_mode smode, wsmode, wvmode;
35730 rtx x;
35732 smode = GET_MODE_INNER (mode);
35733 wvmode = get_mode_wider_vector (mode);
35734 wsmode = GET_MODE_INNER (wvmode);
35736 val = convert_modes (wsmode, smode, val, true);
35737 x = expand_simple_binop (wsmode, ASHIFT, val,
35738 GEN_INT (GET_MODE_BITSIZE (smode)),
35739 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35740 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35742 x = gen_lowpart (wvmode, target);
35743 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35744 gcc_assert (ok);
35745 return ok;
35748 case V16HImode:
35749 case V32QImode:
35751 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35752 rtx x = gen_reg_rtx (hvmode);
35754 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35755 gcc_assert (ok);
35757 x = gen_rtx_VEC_CONCAT (mode, x, x);
35758 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35760 return true;
35762 default:
35763 return false;
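/* The "widen" path above builds a narrow-element broadcast by forming
   (val << elt_bits) | val in the next wider scalar mode and recursing, so
   byte and word duplicates reduce to the wider-element cases.  */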
35767 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35768 whose ONE_VAR element is VAR, and other elements are zero. Return true
35769 if successful. */
35771 static bool
35772 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35773 rtx target, rtx var, int one_var)
35775 enum machine_mode vsimode;
35776 rtx new_target;
35777 rtx x, tmp;
35778 bool use_vector_set = false;
35780 switch (mode)
35782 case V2DImode:
35783 /* For SSE4.1, we normally use vector set. But if the second
35784 element is zero and inter-unit moves are OK, we use movq
35785 instead. */
35786 use_vector_set = (TARGET_64BIT
35787 && TARGET_SSE4_1
35788 && !(TARGET_INTER_UNIT_MOVES
35789 && one_var == 0));
35790 break;
35791 case V16QImode:
35792 case V4SImode:
35793 case V4SFmode:
35794 use_vector_set = TARGET_SSE4_1;
35795 break;
35796 case V8HImode:
35797 use_vector_set = TARGET_SSE2;
35798 break;
35799 case V4HImode:
35800 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35801 break;
35802 case V32QImode:
35803 case V16HImode:
35804 case V8SImode:
35805 case V8SFmode:
35806 case V4DFmode:
35807 use_vector_set = TARGET_AVX;
35808 break;
35809 case V4DImode:
35810 /* Use ix86_expand_vector_set in 64bit mode only. */
35811 use_vector_set = TARGET_AVX && TARGET_64BIT;
35812 break;
35813 default:
35814 break;
35817 if (use_vector_set)
35819 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35820 var = force_reg (GET_MODE_INNER (mode), var);
35821 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35822 return true;
35825 switch (mode)
35827 case V2SFmode:
35828 case V2SImode:
35829 if (!mmx_ok)
35830 return false;
35831 /* FALLTHRU */
35833 case V2DFmode:
35834 case V2DImode:
35835 if (one_var != 0)
35836 return false;
35837 var = force_reg (GET_MODE_INNER (mode), var);
35838 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35839 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35840 return true;
35842 case V4SFmode:
35843 case V4SImode:
35844 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35845 new_target = gen_reg_rtx (mode);
35846 else
35847 new_target = target;
35848 var = force_reg (GET_MODE_INNER (mode), var);
35849 x = gen_rtx_VEC_DUPLICATE (mode, var);
35850 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35851 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35852 if (one_var != 0)
35854 /* We need to shuffle the value to the correct position, so
35855 create a new pseudo to store the intermediate result. */
35857 /* With SSE2, we can use the integer shuffle insns. */
35858 if (mode != V4SFmode && TARGET_SSE2)
35860 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35861 const1_rtx,
35862 GEN_INT (one_var == 1 ? 0 : 1),
35863 GEN_INT (one_var == 2 ? 0 : 1),
35864 GEN_INT (one_var == 3 ? 0 : 1)));
35865 if (target != new_target)
35866 emit_move_insn (target, new_target);
35867 return true;
35870 /* Otherwise convert the intermediate result to V4SFmode and
35871 use the SSE1 shuffle instructions. */
35872 if (mode != V4SFmode)
35874 tmp = gen_reg_rtx (V4SFmode);
35875 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35877 else
35878 tmp = new_target;
35880 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35881 const1_rtx,
35882 GEN_INT (one_var == 1 ? 0 : 1),
35883 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35884 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35886 if (mode != V4SFmode)
35887 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35888 else if (tmp != target)
35889 emit_move_insn (target, tmp);
35891 else if (target != new_target)
35892 emit_move_insn (target, new_target);
35893 return true;
35895 case V8HImode:
35896 case V16QImode:
35897 vsimode = V4SImode;
35898 goto widen;
35899 case V4HImode:
35900 case V8QImode:
35901 if (!mmx_ok)
35902 return false;
35903 vsimode = V2SImode;
35904 goto widen;
35905 widen:
35906 if (one_var != 0)
35907 return false;
35909 /* Zero extend the variable element to SImode and recurse. */
35910 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35912 x = gen_reg_rtx (vsimode);
35913 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35914 var, one_var))
35915 gcc_unreachable ();
35917 emit_move_insn (target, gen_lowpart (mode, x));
35918 return true;
35920 default:
35921 return false;
35925 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35926 consisting of the values in VALS. It is known that all elements
35927 except ONE_VAR are constants. Return true if successful. */
35929 static bool
35930 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35931 rtx target, rtx vals, int one_var)
35933 rtx var = XVECEXP (vals, 0, one_var);
35934 enum machine_mode wmode;
35935 rtx const_vec, x;
35937 const_vec = copy_rtx (vals);
35938 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35939 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35941 switch (mode)
35943 case V2DFmode:
35944 case V2DImode:
35945 case V2SFmode:
35946 case V2SImode:
35947 /* For the two element vectors, it's just as easy to use
35948 the general case. */
35949 return false;
35951 case V4DImode:
35952 /* Use ix86_expand_vector_set in 64bit mode only. */
35953 if (!TARGET_64BIT)
35954 return false;
35955 case V4DFmode:
35956 case V8SFmode:
35957 case V8SImode:
35958 case V16HImode:
35959 case V32QImode:
35960 case V4SFmode:
35961 case V4SImode:
35962 case V8HImode:
35963 case V4HImode:
35964 break;
35966 case V16QImode:
35967 if (TARGET_SSE4_1)
35968 break;
35969 wmode = V8HImode;
35970 goto widen;
35971 case V8QImode:
35972 wmode = V4HImode;
35973 goto widen;
35974 widen:
35975 /* There's no way to set one QImode entry easily. Combine
35976 the variable value with its adjacent constant value, and
35977 promote to an HImode set. */
35978 x = XVECEXP (vals, 0, one_var ^ 1);
35979 if (one_var & 1)
35981 var = convert_modes (HImode, QImode, var, true);
35982 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35983 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35984 x = GEN_INT (INTVAL (x) & 0xff);
35986 else
35988 var = convert_modes (HImode, QImode, var, true);
35989 x = gen_int_mode (INTVAL (x) << 8, HImode);
35991 if (x != const0_rtx)
35992 var = expand_simple_binop (HImode, IOR, var, x, var,
35993 1, OPTAB_LIB_WIDEN);
35995 x = gen_reg_rtx (wmode);
35996 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35997 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35999 emit_move_insn (target, gen_lowpart (mode, x));
36000 return true;
36002 default:
36003 return false;
36006 emit_move_insn (target, const_vec);
36007 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36008 return true;
36011 /* A subroutine of ix86_expand_vector_init_general. Use vector
36012 concatenate to handle the most general case: all values variable,
36013 and none identical. */
36015 static void
36016 ix86_expand_vector_init_concat (enum machine_mode mode,
36017 rtx target, rtx *ops, int n)
36019 enum machine_mode cmode, hmode = VOIDmode;
36020 rtx first[8], second[4];
36021 rtvec v;
36022 int i, j;
36024 switch (n)
36026 case 2:
36027 switch (mode)
36029 case V8SImode:
36030 cmode = V4SImode;
36031 break;
36032 case V8SFmode:
36033 cmode = V4SFmode;
36034 break;
36035 case V4DImode:
36036 cmode = V2DImode;
36037 break;
36038 case V4DFmode:
36039 cmode = V2DFmode;
36040 break;
36041 case V4SImode:
36042 cmode = V2SImode;
36043 break;
36044 case V4SFmode:
36045 cmode = V2SFmode;
36046 break;
36047 case V2DImode:
36048 cmode = DImode;
36049 break;
36050 case V2SImode:
36051 cmode = SImode;
36052 break;
36053 case V2DFmode:
36054 cmode = DFmode;
36055 break;
36056 case V2SFmode:
36057 cmode = SFmode;
36058 break;
36059 default:
36060 gcc_unreachable ();
36063 if (!register_operand (ops[1], cmode))
36064 ops[1] = force_reg (cmode, ops[1]);
36065 if (!register_operand (ops[0], cmode))
36066 ops[0] = force_reg (cmode, ops[0]);
36067 emit_insn (gen_rtx_SET (VOIDmode, target,
36068 gen_rtx_VEC_CONCAT (mode, ops[0],
36069 ops[1])));
36070 break;
36072 case 4:
36073 switch (mode)
36075 case V4DImode:
36076 cmode = V2DImode;
36077 break;
36078 case V4DFmode:
36079 cmode = V2DFmode;
36080 break;
36081 case V4SImode:
36082 cmode = V2SImode;
36083 break;
36084 case V4SFmode:
36085 cmode = V2SFmode;
36086 break;
36087 default:
36088 gcc_unreachable ();
36090 goto half;
36092 case 8:
36093 switch (mode)
36095 case V8SImode:
36096 cmode = V2SImode;
36097 hmode = V4SImode;
36098 break;
36099 case V8SFmode:
36100 cmode = V2SFmode;
36101 hmode = V4SFmode;
36102 break;
36103 default:
36104 gcc_unreachable ();
36106 goto half;
36108 half:
36109 /* FIXME: We process inputs backward to help RA. PR 36222. */
36110 i = n - 1;
36111 j = (n >> 1) - 1;
36112 for (; i > 0; i -= 2, j--)
36114 first[j] = gen_reg_rtx (cmode);
36115 v = gen_rtvec (2, ops[i - 1], ops[i]);
36116 ix86_expand_vector_init (false, first[j],
36117 gen_rtx_PARALLEL (cmode, v));
36120 n >>= 1;
36121 if (n > 2)
36123 gcc_assert (hmode != VOIDmode);
36124 for (i = j = 0; i < n; i += 2, j++)
36126 second[j] = gen_reg_rtx (hmode);
36127 ix86_expand_vector_init_concat (hmode, second [j],
36128 &first [i], 2);
36130 n >>= 1;
36131 ix86_expand_vector_init_concat (mode, target, second, n);
36133 else
36134 ix86_expand_vector_init_concat (mode, target, first, n);
36135 break;
36137 default:
36138 gcc_unreachable ();
36142 /* A subroutine of ix86_expand_vector_init_general. Use vector
36143 interleave to handle the most general case: all values variable,
36144 and none identical. */
36146 static void
36147 ix86_expand_vector_init_interleave (enum machine_mode mode,
36148 rtx target, rtx *ops, int n)
36150 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36151 int i, j;
36152 rtx op0, op1;
36153 rtx (*gen_load_even) (rtx, rtx, rtx);
36154 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36155 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36157 switch (mode)
36159 case V8HImode:
36160 gen_load_even = gen_vec_setv8hi;
36161 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36162 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36163 inner_mode = HImode;
36164 first_imode = V4SImode;
36165 second_imode = V2DImode;
36166 third_imode = VOIDmode;
36167 break;
36168 case V16QImode:
36169 gen_load_even = gen_vec_setv16qi;
36170 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36171 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36172 inner_mode = QImode;
36173 first_imode = V8HImode;
36174 second_imode = V4SImode;
36175 third_imode = V2DImode;
36176 break;
36177 default:
36178 gcc_unreachable ();
36181 for (i = 0; i < n; i++)
36183 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36184 op0 = gen_reg_rtx (SImode);
36185 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36187 /* Insert the SImode value as low element of V4SImode vector. */
36188 op1 = gen_reg_rtx (V4SImode);
36189 op0 = gen_rtx_VEC_MERGE (V4SImode,
36190 gen_rtx_VEC_DUPLICATE (V4SImode,
36191 op0),
36192 CONST0_RTX (V4SImode),
36193 const1_rtx);
36194 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36196 /* Cast the V4SImode vector back to a vector in the original mode. */
36197 op0 = gen_reg_rtx (mode);
36198 emit_move_insn (op0, gen_lowpart (mode, op1));
36200 /* Load even elements into the second position. */
36201 emit_insn (gen_load_even (op0,
36202 force_reg (inner_mode,
36203 ops [i + i + 1]),
36204 const1_rtx));
36206 /* Cast vector to FIRST_IMODE vector. */
36207 ops[i] = gen_reg_rtx (first_imode);
36208 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36211 /* Interleave low FIRST_IMODE vectors. */
36212 for (i = j = 0; i < n; i += 2, j++)
36214 op0 = gen_reg_rtx (first_imode);
36215 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36217 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36218 ops[j] = gen_reg_rtx (second_imode);
36219 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36222 /* Interleave low SECOND_IMODE vectors. */
36223 switch (second_imode)
36225 case V4SImode:
36226 for (i = j = 0; i < n / 2; i += 2, j++)
36228 op0 = gen_reg_rtx (second_imode);
36229 emit_insn (gen_interleave_second_low (op0, ops[i],
36230 ops[i + 1]));
36232 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36233 vector. */
36234 ops[j] = gen_reg_rtx (third_imode);
36235 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36237 second_imode = V2DImode;
36238 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36239 /* FALLTHRU */
36241 case V2DImode:
36242 op0 = gen_reg_rtx (second_imode);
36243 emit_insn (gen_interleave_second_low (op0, ops[0],
36244 ops[1]));
36246 /* Cast the SECOND_IMODE vector back to a vector in the original
36247 mode. */
36248 emit_insn (gen_rtx_SET (VOIDmode, target,
36249 gen_lowpart (mode, op0)));
36250 break;
36252 default:
36253 gcc_unreachable ();
36257 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36258 all values variable, and none identical. */
36260 static void
36261 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36262 rtx target, rtx vals)
36264 rtx ops[32], op0, op1;
36265 enum machine_mode half_mode = VOIDmode;
36266 int n, i;
36268 switch (mode)
36270 case V2SFmode:
36271 case V2SImode:
36272 if (!mmx_ok && !TARGET_SSE)
36273 break;
36274 /* FALLTHRU */
36276 case V8SFmode:
36277 case V8SImode:
36278 case V4DFmode:
36279 case V4DImode:
36280 case V4SFmode:
36281 case V4SImode:
36282 case V2DFmode:
36283 case V2DImode:
36284 n = GET_MODE_NUNITS (mode);
36285 for (i = 0; i < n; i++)
36286 ops[i] = XVECEXP (vals, 0, i);
36287 ix86_expand_vector_init_concat (mode, target, ops, n);
36288 return;
36290 case V32QImode:
36291 half_mode = V16QImode;
36292 goto half;
36294 case V16HImode:
36295 half_mode = V8HImode;
36296 goto half;
36298 half:
36299 n = GET_MODE_NUNITS (mode);
36300 for (i = 0; i < n; i++)
36301 ops[i] = XVECEXP (vals, 0, i);
36302 op0 = gen_reg_rtx (half_mode);
36303 op1 = gen_reg_rtx (half_mode);
36304 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36305 n >> 2);
36306 ix86_expand_vector_init_interleave (half_mode, op1,
36307 &ops [n >> 1], n >> 2);
36308 emit_insn (gen_rtx_SET (VOIDmode, target,
36309 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36310 return;
36312 case V16QImode:
36313 if (!TARGET_SSE4_1)
36314 break;
36315 /* FALLTHRU */
36317 case V8HImode:
36318 if (!TARGET_SSE2)
36319 break;
36321 /* Don't use ix86_expand_vector_init_interleave if we can't
36322 move from GPR to SSE register directly. */
36323 if (!TARGET_INTER_UNIT_MOVES)
36324 break;
36326 n = GET_MODE_NUNITS (mode);
36327 for (i = 0; i < n; i++)
36328 ops[i] = XVECEXP (vals, 0, i);
36329 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36330 return;
36332 case V4HImode:
36333 case V8QImode:
36334 break;
36336 default:
36337 gcc_unreachable ();
36341 int i, j, n_elts, n_words, n_elt_per_word;
36342 enum machine_mode inner_mode;
36343 rtx words[4], shift;
36345 inner_mode = GET_MODE_INNER (mode);
36346 n_elts = GET_MODE_NUNITS (mode);
36347 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36348 n_elt_per_word = n_elts / n_words;
36349 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36351 for (i = 0; i < n_words; ++i)
36353 rtx word = NULL_RTX;
36355 for (j = 0; j < n_elt_per_word; ++j)
36357 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36358 elt = convert_modes (word_mode, inner_mode, elt, true);
36360 if (j == 0)
36361 word = elt;
36362 else
36364 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36365 word, 1, OPTAB_LIB_WIDEN);
36366 word = expand_simple_binop (word_mode, IOR, word, elt,
36367 word, 1, OPTAB_LIB_WIDEN);
36371 words[i] = word;
36374 if (n_words == 1)
36375 emit_move_insn (target, gen_lowpart (mode, words[0]));
36376 else if (n_words == 2)
36378 rtx tmp = gen_reg_rtx (mode);
36379 emit_clobber (tmp);
36380 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36381 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36382 emit_move_insn (target, tmp);
36384 else if (n_words == 4)
36386 rtx tmp = gen_reg_rtx (V4SImode);
36387 gcc_assert (word_mode == SImode);
36388 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36389 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36390 emit_move_insn (target, gen_lowpart (mode, tmp));
36392 else
36393 gcc_unreachable ();
36397 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36398 instructions unless MMX_OK is true. */
36400 void
36401 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36403 enum machine_mode mode = GET_MODE (target);
36404 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36405 int n_elts = GET_MODE_NUNITS (mode);
36406 int n_var = 0, one_var = -1;
36407 bool all_same = true, all_const_zero = true;
36408 int i;
36409 rtx x;
36411 for (i = 0; i < n_elts; ++i)
36413 x = XVECEXP (vals, 0, i);
36414 if (!(CONST_INT_P (x)
36415 || GET_CODE (x) == CONST_DOUBLE
36416 || GET_CODE (x) == CONST_FIXED))
36417 n_var++, one_var = i;
36418 else if (x != CONST0_RTX (inner_mode))
36419 all_const_zero = false;
36420 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36421 all_same = false;
36424 /* Constants are best loaded from the constant pool. */
36425 if (n_var == 0)
36427 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36428 return;
36431 /* If all values are identical, broadcast the value. */
36432 if (all_same
36433 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36434 XVECEXP (vals, 0, 0)))
36435 return;
36437 /* Values where only one field is non-constant are best loaded from
36438 the pool and overwritten via move later. */
36439 if (n_var == 1)
36441 if (all_const_zero
36442 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36443 XVECEXP (vals, 0, one_var),
36444 one_var))
36445 return;
36447 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36448 return;
36451 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
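/* So the overall strategy is: a constant-pool load for all-constant
   vectors, a broadcast when every element is identical, the
   one-nonzero/one-variable helpers when a single element varies, and the
   general concat/interleave code otherwise.  */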
36454 void
36455 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36457 enum machine_mode mode = GET_MODE (target);
36458 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36459 enum machine_mode half_mode;
36460 bool use_vec_merge = false;
36461 rtx tmp;
36462 static rtx (*gen_extract[6][2]) (rtx, rtx)
36464 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36465 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36466 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36467 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36468 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36469 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36471 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36473 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36474 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36475 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36476 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36477 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36478 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36480 int i, j, n;
36482 switch (mode)
36484 case V2SFmode:
36485 case V2SImode:
36486 if (mmx_ok)
36488 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36489 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36490 if (elt == 0)
36491 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36492 else
36493 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36494 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36495 return;
36497 break;
36499 case V2DImode:
36500 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36501 if (use_vec_merge)
36502 break;
36504 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36505 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36506 if (elt == 0)
36507 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36508 else
36509 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36510 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36511 return;
36513 case V2DFmode:
36515 rtx op0, op1;
36517 /* For the two element vectors, we implement a VEC_CONCAT with
36518 the extraction of the other element. */
36520 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36521 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36523 if (elt == 0)
36524 op0 = val, op1 = tmp;
36525 else
36526 op0 = tmp, op1 = val;
36528 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36529 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36531 return;
36533 case V4SFmode:
36534 use_vec_merge = TARGET_SSE4_1;
36535 if (use_vec_merge)
36536 break;
36538 switch (elt)
36540 case 0:
36541 use_vec_merge = true;
36542 break;
36544 case 1:
36545 /* tmp = target = A B C D */
36546 tmp = copy_to_reg (target);
36547 /* target = A A B B */
36548 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36549 /* target = X A B B */
36550 ix86_expand_vector_set (false, target, val, 0);
36551 /* target = A X C D */
36552 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36553 const1_rtx, const0_rtx,
36554 GEN_INT (2+4), GEN_INT (3+4)));
36555 return;
36557 case 2:
36558 /* tmp = target = A B C D */
36559 tmp = copy_to_reg (target);
36560 /* tmp = X B C D */
36561 ix86_expand_vector_set (false, tmp, val, 0);
36562 /* target = A B X D */
36563 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36564 const0_rtx, const1_rtx,
36565 GEN_INT (0+4), GEN_INT (3+4)));
36566 return;
36568 case 3:
36569 /* tmp = target = A B C D */
36570 tmp = copy_to_reg (target);
36571 /* tmp = X B C D */
36572 ix86_expand_vector_set (false, tmp, val, 0);
36573 /* target = A B X D */
36574 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36575 const0_rtx, const1_rtx,
36576 GEN_INT (2+4), GEN_INT (0+4)));
36577 return;
36579 default:
36580 gcc_unreachable ();
36582 break;
36584 case V4SImode:
36585 use_vec_merge = TARGET_SSE4_1;
36586 if (use_vec_merge)
36587 break;
36589 /* Element 0 handled by vec_merge below. */
36590 if (elt == 0)
36592 use_vec_merge = true;
36593 break;
36596 if (TARGET_SSE2)
36598 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36599 store into element 0, then shuffle them back. */
36601 rtx order[4];
36603 order[0] = GEN_INT (elt);
36604 order[1] = const1_rtx;
36605 order[2] = const2_rtx;
36606 order[3] = GEN_INT (3);
36607 order[elt] = const0_rtx;
36609 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36610 order[1], order[2], order[3]));
36612 ix86_expand_vector_set (false, target, val, 0);
36614 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36615 order[1], order[2], order[3]));
36617 else
36619 /* For SSE1, we have to reuse the V4SF code. */
36620 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36621 gen_lowpart (SFmode, val), elt);
36623 return;
36625 case V8HImode:
36626 use_vec_merge = TARGET_SSE2;
36627 break;
36628 case V4HImode:
36629 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36630 break;
36632 case V16QImode:
36633 use_vec_merge = TARGET_SSE4_1;
36634 break;
36636 case V8QImode:
36637 break;
36639 case V32QImode:
36640 half_mode = V16QImode;
36641 j = 0;
36642 n = 16;
36643 goto half;
36645 case V16HImode:
36646 half_mode = V8HImode;
36647 j = 1;
36648 n = 8;
36649 goto half;
36651 case V8SImode:
36652 half_mode = V4SImode;
36653 j = 2;
36654 n = 4;
36655 goto half;
36657 case V4DImode:
36658 half_mode = V2DImode;
36659 j = 3;
36660 n = 2;
36661 goto half;
36663 case V8SFmode:
36664 half_mode = V4SFmode;
36665 j = 4;
36666 n = 4;
36667 goto half;
36669 case V4DFmode:
36670 half_mode = V2DFmode;
36671 j = 5;
36672 n = 2;
36673 goto half;
36675 half:
36676 /* Compute offset. */
36677 i = elt / n;
36678 elt %= n;
36680 gcc_assert (i <= 1);
36682 /* Extract the half. */
36683 tmp = gen_reg_rtx (half_mode);
36684 emit_insn (gen_extract[j][i] (tmp, target));
36686 /* Put val in tmp at elt. */
36687 ix86_expand_vector_set (false, tmp, val, elt);
36689 /* Put it back. */
36690 emit_insn (gen_insert[j][i] (target, target, tmp));
36691 return;
36693 default:
36694 break;
36697 if (use_vec_merge)
36699 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36700 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36701 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36703 else
36705 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36707 emit_move_insn (mem, target);
36709 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36710 emit_move_insn (tmp, val);
36712 emit_move_insn (target, mem);
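/* Worked example for the SSE2 V4SImode path above (illustrative): setting
   element ELT without SSE4.1 swaps element 0 with ELT, inserts at
   position 0, and swaps back.  For elt == 2 and target = { A, B, C, D }:
     pshufd with order { 2, 1, 0, 3 }  ->  { C, B, A, D }
     insert VAL at element 0           ->  { X, B, A, D }
     pshufd with order { 2, 1, 0, 3 }  ->  { A, B, X, D }
   The order vector is its own inverse, since it is a plain transposition,
   which is why the same pshufd is emitted twice.  */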
36716 void
36717 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36719 enum machine_mode mode = GET_MODE (vec);
36720 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36721 bool use_vec_extr = false;
36722 rtx tmp;
36724 switch (mode)
36726 case V2SImode:
36727 case V2SFmode:
36728 if (!mmx_ok)
36729 break;
36730 /* FALLTHRU */
36732 case V2DFmode:
36733 case V2DImode:
36734 use_vec_extr = true;
36735 break;
36737 case V4SFmode:
36738 use_vec_extr = TARGET_SSE4_1;
36739 if (use_vec_extr)
36740 break;
36742 switch (elt)
36744 case 0:
36745 tmp = vec;
36746 break;
36748 case 1:
36749 case 3:
36750 tmp = gen_reg_rtx (mode);
36751 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36752 GEN_INT (elt), GEN_INT (elt),
36753 GEN_INT (elt+4), GEN_INT (elt+4)));
36754 break;
36756 case 2:
36757 tmp = gen_reg_rtx (mode);
36758 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36759 break;
36761 default:
36762 gcc_unreachable ();
36764 vec = tmp;
36765 use_vec_extr = true;
36766 elt = 0;
36767 break;
36769 case V4SImode:
36770 use_vec_extr = TARGET_SSE4_1;
36771 if (use_vec_extr)
36772 break;
36774 if (TARGET_SSE2)
36776 switch (elt)
36778 case 0:
36779 tmp = vec;
36780 break;
36782 case 1:
36783 case 3:
36784 tmp = gen_reg_rtx (mode);
36785 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36786 GEN_INT (elt), GEN_INT (elt),
36787 GEN_INT (elt), GEN_INT (elt)));
36788 break;
36790 case 2:
36791 tmp = gen_reg_rtx (mode);
36792 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36793 break;
36795 default:
36796 gcc_unreachable ();
36798 vec = tmp;
36799 use_vec_extr = true;
36800 elt = 0;
36802 else
36804 /* For SSE1, we have to reuse the V4SF code. */
36805 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36806 gen_lowpart (V4SFmode, vec), elt);
36807 return;
36809 break;
36811 case V8HImode:
36812 use_vec_extr = TARGET_SSE2;
36813 break;
36814 case V4HImode:
36815 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36816 break;
36818 case V16QImode:
36819 use_vec_extr = TARGET_SSE4_1;
36820 break;
36822 case V8SFmode:
36823 if (TARGET_AVX)
36825 tmp = gen_reg_rtx (V4SFmode);
36826 if (elt < 4)
36827 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36828 else
36829 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36830 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36831 return;
36833 break;
36835 case V4DFmode:
36836 if (TARGET_AVX)
36838 tmp = gen_reg_rtx (V2DFmode);
36839 if (elt < 2)
36840 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36841 else
36842 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36843 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36844 return;
36846 break;
36848 case V32QImode:
36849 if (TARGET_AVX)
36851 tmp = gen_reg_rtx (V16QImode);
36852 if (elt < 16)
36853 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36854 else
36855 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36856 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36857 return;
36859 break;
36861 case V16HImode:
36862 if (TARGET_AVX)
36864 tmp = gen_reg_rtx (V8HImode);
36865 if (elt < 8)
36866 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36867 else
36868 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36869 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36870 return;
36872 break;
36874 case V8SImode:
36875 if (TARGET_AVX)
36877 tmp = gen_reg_rtx (V4SImode);
36878 if (elt < 4)
36879 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36880 else
36881 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36882 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36883 return;
36885 break;
36887 case V4DImode:
36888 if (TARGET_AVX)
36890 tmp = gen_reg_rtx (V2DImode);
36891 if (elt < 2)
36892 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36893 else
36894 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36895 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36896 return;
36898 break;
36900 case V8QImode:
36901 /* ??? Could extract the appropriate HImode element and shift. */
36902 default:
36903 break;
36906 if (use_vec_extr)
36908 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36909 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36911 /* Let the rtl optimizers know about the zero extension performed. */
36912 if (inner_mode == QImode || inner_mode == HImode)
36914 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36915 target = gen_lowpart (SImode, target);
36918 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36920 else
36922 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36924 emit_move_insn (mem, vec);
36926 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36927 emit_move_insn (target, tmp);
36931 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36932 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36933 The upper bits of DEST are undefined, though they shouldn't cause
36934 exceptions (some bits from src or all zeros are ok). */
36936 static void
36937 emit_reduc_half (rtx dest, rtx src, int i)
36939 rtx tem;
36940 switch (GET_MODE (src))
36942 case V4SFmode:
36943 if (i == 128)
36944 tem = gen_sse_movhlps (dest, src, src);
36945 else
36946 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36947 GEN_INT (1 + 4), GEN_INT (1 + 4));
36948 break;
36949 case V2DFmode:
36950 tem = gen_vec_interleave_highv2df (dest, src, src);
36951 break;
36952 case V16QImode:
36953 case V8HImode:
36954 case V4SImode:
36955 case V2DImode:
36956 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36957 gen_lowpart (V1TImode, src),
36958 GEN_INT (i / 2));
36959 break;
36960 case V8SFmode:
36961 if (i == 256)
36962 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36963 else
36964 tem = gen_avx_shufps256 (dest, src, src,
36965 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36966 break;
36967 case V4DFmode:
36968 if (i == 256)
36969 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36970 else
36971 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36972 break;
36973 case V32QImode:
36974 case V16HImode:
36975 case V8SImode:
36976 case V4DImode:
36977 if (i == 256)
36978 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36979 gen_lowpart (V4DImode, src),
36980 gen_lowpart (V4DImode, src),
36981 const1_rtx);
36982 else
36983 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36984 gen_lowpart (V2TImode, src),
36985 GEN_INT (i / 2));
36986 break;
36987 default:
36988 gcc_unreachable ();
36990 emit_insn (tem);
36993 /* Expand a vector reduction. FN is the binary pattern to reduce;
36994 DEST is the destination; IN is the input vector. */
36996 void
36997 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36999 rtx half, dst, vec = in;
37000 enum machine_mode mode = GET_MODE (in);
37001 int i;
37003 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37004 if (TARGET_SSE4_1
37005 && mode == V8HImode
37006 && fn == gen_uminv8hi3)
37008 emit_insn (gen_sse4_1_phminposuw (dest, in));
37009 return;
37012 for (i = GET_MODE_BITSIZE (mode);
37013 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37014 i >>= 1)
37016 half = gen_reg_rtx (mode);
37017 emit_reduc_half (half, vec, i);
37018 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37019 dst = dest;
37020 else
37021 dst = gen_reg_rtx (mode);
37022 emit_insn (fn (dst, half, vec));
37023 vec = dst;
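/* Illustrative walk-through of the reduction loop above: for a V4SFmode
   maximum the loop runs with i = 128 and i = 64.  At i = 128,
   emit_reduc_half moves the high two lanes into the low half (movhlps)
   and FN leaves max(a0,a2) and max(a1,a3) in lanes 0 and 1; at i = 64 a
   shufps brings lane 1 down to lane 0 and the final FN writes the full
   reduction into lane 0 of DEST.  The upper lanes stay unspecified, as
   documented on emit_reduc_half.  */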
37027 /* Target hook for scalar_mode_supported_p. */
37028 static bool
37029 ix86_scalar_mode_supported_p (enum machine_mode mode)
37031 if (DECIMAL_FLOAT_MODE_P (mode))
37032 return default_decimal_float_supported_p ();
37033 else if (mode == TFmode)
37034 return true;
37035 else
37036 return default_scalar_mode_supported_p (mode);
37039 /* Implements target hook vector_mode_supported_p. */
37040 static bool
37041 ix86_vector_mode_supported_p (enum machine_mode mode)
37043 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37044 return true;
37045 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37046 return true;
37047 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37048 return true;
37049 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37050 return true;
37051 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37052 return true;
37053 return false;
37056 /* Target hook for c_mode_for_suffix. */
37057 static enum machine_mode
37058 ix86_c_mode_for_suffix (char suffix)
37060 if (suffix == 'q')
37061 return TFmode;
37062 if (suffix == 'w')
37063 return XFmode;
37065 return VOIDmode;
37068 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37070 We do this in the new i386 backend to maintain source compatibility
37071 with the old cc0-based compiler. */
37073 static tree
37074 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37075 tree inputs ATTRIBUTE_UNUSED,
37076 tree clobbers)
37078 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37079 clobbers);
37080 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37081 clobbers);
37082 return clobbers;
37085 /* Implements target vector targetm.asm.encode_section_info. */
37087 static void ATTRIBUTE_UNUSED
37088 ix86_encode_section_info (tree decl, rtx rtl, int first)
37090 default_encode_section_info (decl, rtl, first);
37092 if (TREE_CODE (decl) == VAR_DECL
37093 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37094 && ix86_in_large_data_p (decl))
37095 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37098 /* Worker function for REVERSE_CONDITION. */
37100 enum rtx_code
37101 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37103 return (mode != CCFPmode && mode != CCFPUmode
37104 ? reverse_condition (code)
37105 : reverse_condition_maybe_unordered (code));
37108 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37109 to OPERANDS[0]. */
37111 const char *
37112 output_387_reg_move (rtx insn, rtx *operands)
37114 if (REG_P (operands[0]))
37116 if (REG_P (operands[1])
37117 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37119 if (REGNO (operands[0]) == FIRST_STACK_REG)
37120 return output_387_ffreep (operands, 0);
37121 return "fstp\t%y0";
37123 if (STACK_TOP_P (operands[0]))
37124 return "fld%Z1\t%y1";
37125 return "fst\t%y0";
37127 else if (MEM_P (operands[0]))
37129 gcc_assert (REG_P (operands[1]));
37130 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37131 return "fstp%Z0\t%y0";
37132 else
37134 /* There is no non-popping store to memory for XFmode.
37135 So if we need one, follow the store with a load. */
37136 if (GET_MODE (operands[0]) == XFmode)
37137 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37138 else
37139 return "fst%Z0\t%y0";
37142 else
37143 gcc_unreachable();
37146 /* Output code to perform a conditional jump to LABEL, if the C2 flag in the
37147 x87 FP status register is set. */
37149 void
37150 ix86_emit_fp_unordered_jump (rtx label)
37152 rtx reg = gen_reg_rtx (HImode);
37153 rtx temp;
37155 emit_insn (gen_x86_fnstsw_1 (reg));
37157 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37159 emit_insn (gen_x86_sahf_1 (reg));
37161 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37162 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37164 else
37166 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37168 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37169 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37172 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37173 gen_rtx_LABEL_REF (VOIDmode, label),
37174 pc_rtx);
37175 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37177 emit_jump_insn (temp);
37178 predict_jump (REG_BR_PROB_BASE * 10 / 100);
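/* Background (standard x87 behaviour, noted for clarity): fnstsw stores
   the FPU status word with C2 in bit 10, i.e. bit 2 of the high byte.
   The sahf path copies that byte into EFLAGS, where C2 lands in PF and
   the UNORDERED test picks it up; the fallback tests mask 0x04 against
   the high byte directly, which is the same C2 bit.  */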
37181 /* Output code to perform a log1p XFmode calculation. */
37183 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37185 rtx label1 = gen_label_rtx ();
37186 rtx label2 = gen_label_rtx ();
37188 rtx tmp = gen_reg_rtx (XFmode);
37189 rtx tmp2 = gen_reg_rtx (XFmode);
37190 rtx test;
37192 emit_insn (gen_absxf2 (tmp, op1));
37193 test = gen_rtx_GE (VOIDmode, tmp,
37194 CONST_DOUBLE_FROM_REAL_VALUE (
37195 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37196 XFmode));
37197 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37199 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37200 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37201 emit_jump (label2);
37203 emit_label (label1);
37204 emit_move_insn (tmp, CONST1_RTX (XFmode));
37205 emit_insn (gen_addxf3 (tmp, op1, tmp));
37206 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37207 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37209 emit_label (label2);
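/* A note on the magic constant above (reasoning; the value is assumed to
   be 1 - sqrt(2)/2 ~= 0.29289321881): fyl2xp1 computes y * log2(x + 1)
   but is only specified for |x| below that bound, where x + 1 loses no
   precision.  Inside the range, fldln2 * log2(op1 + 1) gives log1p(op1)
   directly; outside it, the code falls back to fyl2x on 1 + op1.  */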
37212 /* Emit code for round calculation. */
37213 void ix86_emit_i387_round (rtx op0, rtx op1)
37215 enum machine_mode inmode = GET_MODE (op1);
37216 enum machine_mode outmode = GET_MODE (op0);
37217 rtx e1, e2, res, tmp, tmp1, half;
37218 rtx scratch = gen_reg_rtx (HImode);
37219 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37220 rtx jump_label = gen_label_rtx ();
37221 rtx insn;
37222 rtx (*gen_abs) (rtx, rtx);
37223 rtx (*gen_neg) (rtx, rtx);
37225 switch (inmode)
37227 case SFmode:
37228 gen_abs = gen_abssf2;
37229 break;
37230 case DFmode:
37231 gen_abs = gen_absdf2;
37232 break;
37233 case XFmode:
37234 gen_abs = gen_absxf2;
37235 break;
37236 default:
37237 gcc_unreachable ();
37240 switch (outmode)
37242 case SFmode:
37243 gen_neg = gen_negsf2;
37244 break;
37245 case DFmode:
37246 gen_neg = gen_negdf2;
37247 break;
37248 case XFmode:
37249 gen_neg = gen_negxf2;
37250 break;
37251 case HImode:
37252 gen_neg = gen_neghi2;
37253 break;
37254 case SImode:
37255 gen_neg = gen_negsi2;
37256 break;
37257 case DImode:
37258 gen_neg = gen_negdi2;
37259 break;
37260 default:
37261 gcc_unreachable ();
37264 e1 = gen_reg_rtx (inmode);
37265 e2 = gen_reg_rtx (inmode);
37266 res = gen_reg_rtx (outmode);
37268 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37270 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37272 /* scratch = fxam(op1) */
37273 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37274 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37275 UNSPEC_FXAM)));
37276 /* e1 = fabs(op1) */
37277 emit_insn (gen_abs (e1, op1));
37279 /* e2 = e1 + 0.5 */
37280 half = force_reg (inmode, half);
37281 emit_insn (gen_rtx_SET (VOIDmode, e2,
37282 gen_rtx_PLUS (inmode, e1, half)));
37284 /* res = floor(e2) */
37285 if (inmode != XFmode)
37287 tmp1 = gen_reg_rtx (XFmode);
37289 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37290 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37292 else
37293 tmp1 = e2;
37295 switch (outmode)
37297 case SFmode:
37298 case DFmode:
37300 rtx tmp0 = gen_reg_rtx (XFmode);
37302 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37304 emit_insn (gen_rtx_SET (VOIDmode, res,
37305 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37306 UNSPEC_TRUNC_NOOP)));
37308 break;
37309 case XFmode:
37310 emit_insn (gen_frndintxf2_floor (res, tmp1));
37311 break;
37312 case HImode:
37313 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37314 break;
37315 case SImode:
37316 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37317 break;
37318 case DImode:
37319 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37320 break;
37321 default:
37322 gcc_unreachable ();
37325 /* flags = signbit(a) */
37326 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37328 /* if (flags) then res = -res */
37329 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37330 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37331 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37332 pc_rtx);
37333 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37334 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37335 JUMP_LABEL (insn) = jump_label;
37337 emit_insn (gen_neg (res, res));
37339 emit_label (jump_label);
37340 LABEL_NUSES (jump_label) = 1;
37342 emit_move_insn (op0, res);
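/* Sketch of the identity used above (illustrative): rounding half away
   from zero can be written as sgn(a) * floor(|a| + 0.5).  For a = -2.5,
   |a| + 0.5 = 3.0, floor gives 3, and negating based on the fxam sign
   bit (C1, mask 0x02 in the status word's high byte) yields -3.  Using
   fxam instead of comparing against zero also keeps the sign of -0.0.  */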
37345 /* Output code to perform a Newton-Raphson approximation of a single precision
37346 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37348 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37350 rtx x0, x1, e0, e1;
37352 x0 = gen_reg_rtx (mode);
37353 e0 = gen_reg_rtx (mode);
37354 e1 = gen_reg_rtx (mode);
37355 x1 = gen_reg_rtx (mode);
37357 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37359 b = force_reg (mode, b);
37361 /* x0 = rcp(b) estimate */
37362 emit_insn (gen_rtx_SET (VOIDmode, x0,
37363 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37364 UNSPEC_RCP)));
37365 /* e0 = x0 * b */
37366 emit_insn (gen_rtx_SET (VOIDmode, e0,
37367 gen_rtx_MULT (mode, x0, b)));
37369 /* e0 = x0 * e0 */
37370 emit_insn (gen_rtx_SET (VOIDmode, e0,
37371 gen_rtx_MULT (mode, x0, e0)));
37373 /* e1 = x0 + x0 */
37374 emit_insn (gen_rtx_SET (VOIDmode, e1,
37375 gen_rtx_PLUS (mode, x0, x0)));
37377 /* x1 = e1 - e0 */
37378 emit_insn (gen_rtx_SET (VOIDmode, x1,
37379 gen_rtx_MINUS (mode, e1, e0)));
37381 /* res = a * x1 */
37382 emit_insn (gen_rtx_SET (VOIDmode, res,
37383 gen_rtx_MULT (mode, a, x1)));
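/* Illustrative scalar sketch of the sequence above (rcp_refined is a
   hypothetical helper, not part of this file): one Newton-Raphson step
   for f(x) = 1/x - b refines the ~12-bit rcpps estimate to roughly
   single precision:

     static float rcp_refined (float b, float x0)  // x0 ~= 1/b from rcpps
     {
       return x0 + x0 - b * x0 * x0;               // == x0 * (2 - b * x0)
     }

   The expansion builds e0 = b*x0*x0 and e1 = x0 + x0 separately and then
   multiplies a by (e1 - e0), giving a/b ~= a * x1.  */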
37386 /* Output code to perform a Newton-Raphson approximation of a
37387 single precision floating point [reciprocal] square root. */
37389 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37390 bool recip)
37392 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37393 REAL_VALUE_TYPE r;
37395 x0 = gen_reg_rtx (mode);
37396 e0 = gen_reg_rtx (mode);
37397 e1 = gen_reg_rtx (mode);
37398 e2 = gen_reg_rtx (mode);
37399 e3 = gen_reg_rtx (mode);
37401 real_from_integer (&r, VOIDmode, -3, -1, 0);
37402 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37404 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37405 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37407 if (VECTOR_MODE_P (mode))
37409 mthree = ix86_build_const_vector (mode, true, mthree);
37410 mhalf = ix86_build_const_vector (mode, true, mhalf);
37413 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37414 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37416 a = force_reg (mode, a);
37418 /* x0 = rsqrt(a) estimate */
37419 emit_insn (gen_rtx_SET (VOIDmode, x0,
37420 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37421 UNSPEC_RSQRT)));
37423 /* If a == 0.0, mask the rsqrt estimate (Inf) to zero so that sqrt(0.0) does not become 0 * Inf = NaN. */
37424 if (!recip)
37426 rtx zero, mask;
37428 zero = gen_reg_rtx (mode);
37429 mask = gen_reg_rtx (mode);
37431 zero = force_reg (mode, CONST0_RTX(mode));
37432 emit_insn (gen_rtx_SET (VOIDmode, mask,
37433 gen_rtx_NE (mode, zero, a)));
37435 emit_insn (gen_rtx_SET (VOIDmode, x0,
37436 gen_rtx_AND (mode, x0, mask)));
37439 /* e0 = x0 * a */
37440 emit_insn (gen_rtx_SET (VOIDmode, e0,
37441 gen_rtx_MULT (mode, x0, a)));
37442 /* e1 = e0 * x0 */
37443 emit_insn (gen_rtx_SET (VOIDmode, e1,
37444 gen_rtx_MULT (mode, e0, x0)));
37446 /* e2 = e1 - 3. */
37447 mthree = force_reg (mode, mthree);
37448 emit_insn (gen_rtx_SET (VOIDmode, e2,
37449 gen_rtx_PLUS (mode, e1, mthree)));
37451 mhalf = force_reg (mode, mhalf);
37452 if (recip)
37453 /* e3 = -.5 * x0 */
37454 emit_insn (gen_rtx_SET (VOIDmode, e3,
37455 gen_rtx_MULT (mode, x0, mhalf)));
37456 else
37457 /* e3 = -.5 * e0 */
37458 emit_insn (gen_rtx_SET (VOIDmode, e3,
37459 gen_rtx_MULT (mode, e0, mhalf)));
37460 /* ret = e2 * e3 */
37461 emit_insn (gen_rtx_SET (VOIDmode, res,
37462 gen_rtx_MULT (mode, e2, e3)));
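/* Illustrative algebra for the sequence above: with x0 ~= 1/sqrt(a) from
   rsqrtps, e1 = a*x0*x0, e2 = a*x0*x0 - 3, and e3 = -0.5*x0 (or
   -0.5*a*x0 when computing sqrt), so
     rsqrt(a) ~= 0.5 * x0   * (3 - a*x0*x0)
     sqrt(a)  ~= 0.5 * a*x0 * (3 - a*x0*x0)
   which is the standard Newton-Raphson refinement of the reciprocal
   square root, multiplied through by a in the sqrt case.  The a != 0
   mask applied to x0 above zeroes the Inf estimate first so that
   sqrt(0.0) does not become 0 * Inf = NaN.  */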
37465 #ifdef TARGET_SOLARIS
37466 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37468 static void
37469 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37470 tree decl)
37472 /* With Binutils 2.15, the "@unwind" marker must be specified on
37473 every occurrence of the ".eh_frame" section, not just the first
37474 one. */
37475 if (TARGET_64BIT
37476 && strcmp (name, ".eh_frame") == 0)
37478 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37479 flags & SECTION_WRITE ? "aw" : "a");
37480 return;
37483 #ifndef USE_GAS
37484 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37486 solaris_elf_asm_comdat_section (name, flags, decl);
37487 return;
37489 #endif
37491 default_elf_asm_named_section (name, flags, decl);
37493 #endif /* TARGET_SOLARIS */
37495 /* Return the mangling of TYPE if it is an extended fundamental type. */
37497 static const char *
37498 ix86_mangle_type (const_tree type)
37500 type = TYPE_MAIN_VARIANT (type);
37502 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37503 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37504 return NULL;
37506 switch (TYPE_MODE (type))
37508 case TFmode:
37509 /* __float128 is "g". */
37510 return "g";
37511 case XFmode:
37512 /* "long double" or __float80 is "e". */
37513 return "e";
37514 default:
37515 return NULL;
37519 /* For 32-bit code we can save PIC register setup by using
37520 __stack_chk_fail_local hidden function instead of calling
37521 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
37522 register, so it is better to call __stack_chk_fail directly. */
37524 static tree ATTRIBUTE_UNUSED
37525 ix86_stack_protect_fail (void)
37527 return TARGET_64BIT
37528 ? default_external_stack_protect_fail ()
37529 : default_hidden_stack_protect_fail ();
37532 /* Select a format to encode pointers in exception handling data. CODE
37533 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37534 true if the symbol may be affected by dynamic relocations.
37536 ??? All x86 object file formats are capable of representing this.
37537 After all, the relocation needed is the same as for the call insn.
37538 Whether or not a particular assembler allows us to enter such, I
37539 guess we'll have to see. */
37540 int
37541 asm_preferred_eh_data_format (int code, int global)
37543 if (flag_pic)
37545 int type = DW_EH_PE_sdata8;
37546 if (!TARGET_64BIT
37547 || ix86_cmodel == CM_SMALL_PIC
37548 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37549 type = DW_EH_PE_sdata4;
37550 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37552 if (ix86_cmodel == CM_SMALL
37553 || (ix86_cmodel == CM_MEDIUM && code))
37554 return DW_EH_PE_udata4;
37555 return DW_EH_PE_absptr;
37558 /* Expand copysign from SIGN to the positive value ABS_VALUE
37559 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37560 the sign-bit. */
37561 static void
37562 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37564 enum machine_mode mode = GET_MODE (sign);
37565 rtx sgn = gen_reg_rtx (mode);
37566 if (mask == NULL_RTX)
37568 enum machine_mode vmode;
37570 if (mode == SFmode)
37571 vmode = V4SFmode;
37572 else if (mode == DFmode)
37573 vmode = V2DFmode;
37574 else
37575 vmode = mode;
37577 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37578 if (!VECTOR_MODE_P (mode))
37580 /* We need to generate a scalar mode mask in this case. */
37581 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37582 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37583 mask = gen_reg_rtx (mode);
37584 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37587 else
37588 mask = gen_rtx_NOT (mode, mask);
37589 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37590 gen_rtx_AND (mode, mask, sign)));
37591 emit_insn (gen_rtx_SET (VOIDmode, result,
37592 gen_rtx_IOR (mode, abs_value, sgn)));
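/* Sketch of the bit manipulation above (illustrative): MASK ends up
   selecting only the sign bit (when a mask from ix86_expand_sse_fabs is
   passed in, it is the complement, hence the NOT), so the expansion is
     result = abs_value | (sign & sign_bit_mask)
   i.e. an ordinary copysign onto a value already known to be
   non-negative.  */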
37595 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37596 mask for masking out the sign-bit is stored in *SMASK, if that is
37597 non-null. */
37598 static rtx
37599 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37601 enum machine_mode vmode, mode = GET_MODE (op0);
37602 rtx xa, mask;
37604 xa = gen_reg_rtx (mode);
37605 if (mode == SFmode)
37606 vmode = V4SFmode;
37607 else if (mode == DFmode)
37608 vmode = V2DFmode;
37609 else
37610 vmode = mode;
37611 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37612 if (!VECTOR_MODE_P (mode))
37614 /* We need to generate a scalar mode mask in this case. */
37615 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37616 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37617 mask = gen_reg_rtx (mode);
37618 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37620 emit_insn (gen_rtx_SET (VOIDmode, xa,
37621 gen_rtx_AND (mode, op0, mask)));
37623 if (smask)
37624 *smask = mask;
37626 return xa;
37629 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37630 swapping the operands if SWAP_OPERANDS is true. The expanded
37631 code is a forward jump to a newly created label in case the
37632 comparison is true. The generated label rtx is returned. */
37633 static rtx
37634 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37635 bool swap_operands)
37637 rtx label, tmp;
37639 if (swap_operands)
37641 tmp = op0;
37642 op0 = op1;
37643 op1 = tmp;
37646 label = gen_label_rtx ();
37647 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37648 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37649 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37650 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37651 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37652 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37653 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37654 JUMP_LABEL (tmp) = label;
37656 return label;
37659 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37660 using comparison code CODE. Operands are swapped for the comparison if
37661 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37662 static rtx
37663 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37664 bool swap_operands)
37666 rtx (*insn)(rtx, rtx, rtx, rtx);
37667 enum machine_mode mode = GET_MODE (op0);
37668 rtx mask = gen_reg_rtx (mode);
37670 if (swap_operands)
37672 rtx tmp = op0;
37673 op0 = op1;
37674 op1 = tmp;
37677 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37679 emit_insn (insn (mask, op0, op1,
37680 gen_rtx_fmt_ee (code, mode, op0, op1)));
37681 return mask;
37684 /* Generate and return an rtx of mode MODE for 2**n, where n is the number of
37685 explicit fraction bits in MODE's mantissa; MODE must be DFmode (n = 52) or SFmode (n = 23). */
37686 static rtx
37687 ix86_gen_TWO52 (enum machine_mode mode)
37689 REAL_VALUE_TYPE TWO52r;
37690 rtx TWO52;
37692 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37693 TWO52 = const_double_from_real_value (TWO52r, mode);
37694 TWO52 = force_reg (mode, TWO52);
37696 return TWO52;
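/* Background on the TWO52 constant (explanatory reasoning): for
   |x| < 2**52 a double still has fraction bits below the ones place, so
   (x + 2**52) - 2**52 forces those bits to be rounded away in the
   current rounding mode and yields a nearby integer.  Values with
   |x| >= 2**52 are already integral, which is why the callers below
   guard with "if (!isless (xa, TWO52)) return x;".  The same argument
   holds for SFmode with 2**23.  */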
37699 /* Expand SSE sequence for computing lround from OP1 storing
37700 into OP0. */
37701 void
37702 ix86_expand_lround (rtx op0, rtx op1)
37704 /* C code for the stuff we're doing below:
37705 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37706 return (long)tmp;
37708 enum machine_mode mode = GET_MODE (op1);
37709 const struct real_format *fmt;
37710 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37711 rtx adj;
37713 /* load nextafter (0.5, 0.0) */
37714 fmt = REAL_MODE_FORMAT (mode);
37715 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37716 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37718 /* adj = copysign (0.5, op1) */
37719 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37720 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37722 /* adj = op1 + adj */
37723 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37725 /* op0 = (imode)adj */
37726 expand_fix (op0, adj, 0);
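/* Why nextafter (0.5, 0.0) rather than 0.5 (illustrative reasoning):
   adding an exact 0.5 can round x + 0.5 up to the next integer when x is
   the largest double below 0.5 (0.49999999999999994 + 0.5 rounds to 1.0),
   so lround would return 1 instead of 0.  Using the predecessor of 0.5
   avoids that corner case, while true halfway cases such as 2.5 still
   round up to the next integer before the fix conversion truncates.  */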
37729 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
37730 into OPERAND0. */
37731 void
37732 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37734 /* C code for the stuff we're doing below (for do_floor):
37735 xi = (long)op1;
37736 xi -= (double)xi > op1 ? 1 : 0;
37737 return xi;
37739 enum machine_mode fmode = GET_MODE (op1);
37740 enum machine_mode imode = GET_MODE (op0);
37741 rtx ireg, freg, label, tmp;
37743 /* reg = (long)op1 */
37744 ireg = gen_reg_rtx (imode);
37745 expand_fix (ireg, op1, 0);
37747 /* freg = (double)reg */
37748 freg = gen_reg_rtx (fmode);
37749 expand_float (freg, ireg, 0);
37751 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37752 label = ix86_expand_sse_compare_and_jump (UNLE,
37753 freg, op1, !do_floor);
37754 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37755 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37756 emit_move_insn (ireg, tmp);
37758 emit_label (label);
37759 LABEL_NUSES (label) = 1;
37761 emit_move_insn (op0, ireg);
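/* Worked example for the floor path above (illustrative): expand_fix
   truncates toward zero, so for op1 = -1.5 the initial conversion gives
   ireg = -1 and freg = -1.0.  Since -1.0 > -1.5 the UNLE branch is not
   taken and ireg is decremented to -2, the correct floor.  For the ceil
   path the operands of the comparison are swapped and 1 is added
   instead.  */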
37764 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37765 result in OPERAND0. */
37766 void
37767 ix86_expand_rint (rtx operand0, rtx operand1)
37769 /* C code for the stuff we're doing below:
37770 xa = fabs (operand1);
37771 if (!isless (xa, 2**52))
37772 return operand1;
37773 xa = xa + 2**52 - 2**52;
37774 return copysign (xa, operand1);
37776 enum machine_mode mode = GET_MODE (operand0);
37777 rtx res, xa, label, TWO52, mask;
37779 res = gen_reg_rtx (mode);
37780 emit_move_insn (res, operand1);
37782 /* xa = abs (operand1) */
37783 xa = ix86_expand_sse_fabs (res, &mask);
37785 /* if (!isless (xa, TWO52)) goto label; */
37786 TWO52 = ix86_gen_TWO52 (mode);
37787 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37789 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37790 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37792 ix86_sse_copysign_to_positive (res, xa, res, mask);
37794 emit_label (label);
37795 LABEL_NUSES (label) = 1;
37797 emit_move_insn (operand0, res);
37800 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37801 into OPERAND0. */
37802 void
37803 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37805 /* C code for the stuff we expand below.
37806 double xa = fabs (x), x2;
37807 if (!isless (xa, TWO52))
37808 return x;
37809 xa = xa + TWO52 - TWO52;
37810 x2 = copysign (xa, x);
37811 Compensate. Floor:
37812 if (x2 > x)
37813 x2 -= 1;
37814 Compensate. Ceil:
37815 if (x2 < x)
37816 x2 -= -1;
37817 return x2;
37819 enum machine_mode mode = GET_MODE (operand0);
37820 rtx xa, TWO52, tmp, label, one, res, mask;
37822 TWO52 = ix86_gen_TWO52 (mode);
37824 /* Temporary for holding the result, initialized to the input
37825 operand to ease control flow. */
37826 res = gen_reg_rtx (mode);
37827 emit_move_insn (res, operand1);
37829 /* xa = abs (operand1) */
37830 xa = ix86_expand_sse_fabs (res, &mask);
37832 /* if (!isless (xa, TWO52)) goto label; */
37833 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37835 /* xa = xa + TWO52 - TWO52; */
37836 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37837 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37839 /* xa = copysign (xa, operand1) */
37840 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37842 /* generate 1.0 or -1.0 */
37843 one = force_reg (mode,
37844 const_double_from_real_value (do_floor
37845 ? dconst1 : dconstm1, mode));
37847 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37848 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37849 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37850 gen_rtx_AND (mode, one, tmp)));
37851 /* We always need to subtract here to preserve signed zero. */
37852 tmp = expand_simple_binop (mode, MINUS,
37853 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37854 emit_move_insn (res, tmp);
37856 emit_label (label);
37857 LABEL_NUSES (label) = 1;
37859 emit_move_insn (operand0, res);
37862 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37863 into OPERAND0. */
37864 void
37865 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37867 /* C code for the stuff we expand below.
37868 double xa = fabs (x), x2;
37869 if (!isless (xa, TWO52))
37870 return x;
37871 x2 = (double)(long)x;
37872 Compensate. Floor:
37873 if (x2 > x)
37874 x2 -= 1;
37875 Compensate. Ceil:
37876 if (x2 < x)
37877 x2 += 1;
37878 if (HONOR_SIGNED_ZEROS (mode))
37879 return copysign (x2, x);
37880 return x2;
37882 enum machine_mode mode = GET_MODE (operand0);
37883 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37885 TWO52 = ix86_gen_TWO52 (mode);
37887 /* Temporary for holding the result, initialized to the input
37888 operand to ease control flow. */
37889 res = gen_reg_rtx (mode);
37890 emit_move_insn (res, operand1);
37892 /* xa = abs (operand1) */
37893 xa = ix86_expand_sse_fabs (res, &mask);
37895 /* if (!isless (xa, TWO52)) goto label; */
37896 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37898 /* xa = (double)(long)x */
37899 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37900 expand_fix (xi, res, 0);
37901 expand_float (xa, xi, 0);
37903 /* generate 1.0 */
37904 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37906 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37907 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37908 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37909 gen_rtx_AND (mode, one, tmp)));
37910 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37911 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37912 emit_move_insn (res, tmp);
37914 if (HONOR_SIGNED_ZEROS (mode))
37915 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37917 emit_label (label);
37918 LABEL_NUSES (label) = 1;
37920 emit_move_insn (operand0, res);
37923 /* Expand SSE sequence for computing round from OPERAND1 storing
37924 into OPERAND0. Sequence that works without relying on DImode truncation
37925 via cvttsd2siq that is only available on 64bit targets. */
37926 void
37927 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37929 /* C code for the stuff we expand below.
37930 double xa = fabs (x), xa2, x2;
37931 if (!isless (xa, TWO52))
37932 return x;
37933 Using the absolute value and copying back sign makes
37934 -0.0 -> -0.0 correct.
37935 xa2 = xa + TWO52 - TWO52;
37936 Compensate.
37937 dxa = xa2 - xa;
37938 if (dxa <= -0.5)
37939 xa2 += 1;
37940 else if (dxa > 0.5)
37941 xa2 -= 1;
37942 x2 = copysign (xa2, x);
37943 return x2;
37945 enum machine_mode mode = GET_MODE (operand0);
37946 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37948 TWO52 = ix86_gen_TWO52 (mode);
37950 /* Temporary for holding the result, initialized to the input
37951 operand to ease control flow. */
37952 res = gen_reg_rtx (mode);
37953 emit_move_insn (res, operand1);
37955 /* xa = abs (operand1) */
37956 xa = ix86_expand_sse_fabs (res, &mask);
37958 /* if (!isless (xa, TWO52)) goto label; */
37959 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37961 /* xa2 = xa + TWO52 - TWO52; */
37962 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37963 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37965 /* dxa = xa2 - xa; */
37966 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37968 /* generate 0.5, 1.0 and -0.5 */
37969 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37970 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37971 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37972 0, OPTAB_DIRECT);
37974 /* Compensate. */
37975 tmp = gen_reg_rtx (mode);
37976 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37977 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37978 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37979 gen_rtx_AND (mode, one, tmp)));
37980 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37981 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37982 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37983 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37984 gen_rtx_AND (mode, one, tmp)));
37985 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37987 /* res = copysign (xa2, operand1) */
37988 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37990 emit_label (label);
37991 LABEL_NUSES (label) = 1;
37993 emit_move_insn (operand0, res);
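/* Worked example for the compensation above (illustrative): for x = 2.5,
   xa2 = (2.5 + TWO52) - TWO52 = 2.0 under round-to-nearest-even, so
   dxa = -0.5.  The dxa <= -0.5 test then adds 1, giving 3.0, and
   copysign restores the sign, so 2.5 rounds to 3 (half away from zero)
   rather than to the even neighbour the TWO52 trick alone would pick.  */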
37996 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37997 into OPERAND0. */
37998 void
37999 ix86_expand_trunc (rtx operand0, rtx operand1)
38001 /* C code for SSE variant we expand below.
38002 double xa = fabs (x), x2;
38003 if (!isless (xa, TWO52))
38004 return x;
38005 x2 = (double)(long)x;
38006 if (HONOR_SIGNED_ZEROS (mode))
38007 return copysign (x2, x);
38008 return x2;
38010 enum machine_mode mode = GET_MODE (operand0);
38011 rtx xa, xi, TWO52, label, res, mask;
38013 TWO52 = ix86_gen_TWO52 (mode);
38015 /* Temporary for holding the result, initialized to the input
38016 operand to ease control flow. */
38017 res = gen_reg_rtx (mode);
38018 emit_move_insn (res, operand1);
38020 /* xa = abs (operand1) */
38021 xa = ix86_expand_sse_fabs (res, &mask);
38023 /* if (!isless (xa, TWO52)) goto label; */
38024 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38026 /* x = (double)(long)x */
38027 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38028 expand_fix (xi, res, 0);
38029 expand_float (res, xi, 0);
38031 if (HONOR_SIGNED_ZEROS (mode))
38032 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38034 emit_label (label);
38035 LABEL_NUSES (label) = 1;
38037 emit_move_insn (operand0, res);
38040 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38041 into OPERAND0. */
38042 void
38043 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38045 enum machine_mode mode = GET_MODE (operand0);
38046 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38048 /* C code for SSE variant we expand below.
38049 double xa = fabs (x), x2;
38050 if (!isless (xa, TWO52))
38051 return x;
38052 xa2 = xa + TWO52 - TWO52;
38053 Compensate:
38054 if (xa2 > xa)
38055 xa2 -= 1.0;
38056 x2 = copysign (xa2, x);
38057 return x2;
38060 TWO52 = ix86_gen_TWO52 (mode);
38062 /* Temporary for holding the result, initialized to the input
38063 operand to ease control flow. */
38064 res = gen_reg_rtx (mode);
38065 emit_move_insn (res, operand1);
38067 /* xa = abs (operand1) */
38068 xa = ix86_expand_sse_fabs (res, &smask);
38070 /* if (!isless (xa, TWO52)) goto label; */
38071 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38073 /* res = xa + TWO52 - TWO52; */
38074 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38075 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38076 emit_move_insn (res, tmp);
38078 /* generate 1.0 */
38079 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38081 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38082 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38083 emit_insn (gen_rtx_SET (VOIDmode, mask,
38084 gen_rtx_AND (mode, mask, one)));
38085 tmp = expand_simple_binop (mode, MINUS,
38086 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38087 emit_move_insn (res, tmp);
38089 /* res = copysign (res, operand1) */
38090 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38092 emit_label (label);
38093 LABEL_NUSES (label) = 1;
38095 emit_move_insn (operand0, res);
38098 /* Expand SSE sequence for computing round from OPERAND1 storing
38099 into OPERAND0. */
38100 void
38101 ix86_expand_round (rtx operand0, rtx operand1)
38103 /* C code for the stuff we're doing below:
38104 double xa = fabs (x);
38105 if (!isless (xa, TWO52))
38106 return x;
38107 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38108 return copysign (xa, x);
38110 enum machine_mode mode = GET_MODE (operand0);
38111 rtx res, TWO52, xa, label, xi, half, mask;
38112 const struct real_format *fmt;
38113 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38115 /* Temporary for holding the result, initialized to the input
38116 operand to ease control flow. */
38117 res = gen_reg_rtx (mode);
38118 emit_move_insn (res, operand1);
38120 TWO52 = ix86_gen_TWO52 (mode);
38121 xa = ix86_expand_sse_fabs (res, &mask);
38122 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38124 /* load nextafter (0.5, 0.0) */
38125 fmt = REAL_MODE_FORMAT (mode);
38126 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38127 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38129 /* xa = xa + 0.5 */
38130 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38131 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38133 /* xa = (double)(int64_t)xa */
38134 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38135 expand_fix (xi, xa, 0);
38136 expand_float (xa, xi, 0);
38138 /* res = copysign (xa, operand1) */
38139 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38141 emit_label (label);
38142 LABEL_NUSES (label) = 1;
38144 emit_move_insn (operand0, res);
38147 /* Expand SSE sequence for computing round
38148 from OP1 storing into OP0 using sse4 round insn. */
38149 void
38150 ix86_expand_round_sse4 (rtx op0, rtx op1)
38152 enum machine_mode mode = GET_MODE (op0);
38153 rtx e1, e2, res, half;
38154 const struct real_format *fmt;
38155 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38156 rtx (*gen_copysign) (rtx, rtx, rtx);
38157 rtx (*gen_round) (rtx, rtx, rtx);
38159 switch (mode)
38161 case SFmode:
38162 gen_copysign = gen_copysignsf3;
38163 gen_round = gen_sse4_1_roundsf2;
38164 break;
38165 case DFmode:
38166 gen_copysign = gen_copysigndf3;
38167 gen_round = gen_sse4_1_rounddf2;
38168 break;
38169 default:
38170 gcc_unreachable ();
38173 /* round (a) = trunc (a + copysign (0.5, a)) */
38175 /* load nextafter (0.5, 0.0) */
38176 fmt = REAL_MODE_FORMAT (mode);
38177 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38178 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38179 half = const_double_from_real_value (pred_half, mode);
38181 /* e1 = copysign (0.5, op1) */
38182 e1 = gen_reg_rtx (mode);
38183 emit_insn (gen_copysign (e1, half, op1));
38185 /* e2 = op1 + e1 */
38186 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38188 /* res = trunc (e2) */
38189 res = gen_reg_rtx (mode);
38190 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38192 emit_move_insn (op0, res);
38196 /* Table of valid machine attributes. */
38197 static const struct attribute_spec ix86_attribute_table[] =
38199 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38200 affects_type_identity } */
38201 /* Stdcall attribute says callee is responsible for popping arguments
38202 if they are not variable. */
38203 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38204 true },
38205 /* Fastcall attribute says callee is responsible for popping arguments
38206 if they are not variable. */
38207 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38208 true },
38209 /* Thiscall attribute says callee is responsible for popping arguments
38210 if they are not variable. */
38211 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38212 true },
38213 /* Cdecl attribute says the callee is a normal C declaration */
38214 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38215 true },
38216 /* Regparm attribute specifies how many integer arguments are to be
38217 passed in registers. */
38218 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38219 true },
38220 /* Sseregparm attribute says we are using x86_64 calling conventions
38221 for FP arguments. */
38222 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38223 true },
38224 /* The transactional memory builtins are implicitly regparm or fastcall
38225 depending on the ABI. Override the generic do-nothing attribute that
38226 these builtins were declared with. */
38227 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38228 true },
38229 /* force_align_arg_pointer says this function realigns the stack at entry. */
38230 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38231 false, true, true, ix86_handle_cconv_attribute, false },
38232 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38233 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38234 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38235 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38236 false },
38237 #endif
38238 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38239 false },
38240 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38241 false },
38242 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38243 SUBTARGET_ATTRIBUTE_TABLE,
38244 #endif
38245 /* ms_abi and sysv_abi calling convention function attributes. */
38246 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38247 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38248 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38249 false },
38250 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38251 ix86_handle_callee_pop_aggregate_return, true },
38252 /* End element. */
38253 { NULL, 0, 0, false, false, false, NULL, false }
38256 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38257 static int
38258 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38259 tree vectype,
38260 int misalign ATTRIBUTE_UNUSED)
38262 unsigned elements;
38264 switch (type_of_cost)
38266 case scalar_stmt:
38267 return ix86_cost->scalar_stmt_cost;
38269 case scalar_load:
38270 return ix86_cost->scalar_load_cost;
38272 case scalar_store:
38273 return ix86_cost->scalar_store_cost;
38275 case vector_stmt:
38276 return ix86_cost->vec_stmt_cost;
38278 case vector_load:
38279 return ix86_cost->vec_align_load_cost;
38281 case vector_store:
38282 return ix86_cost->vec_store_cost;
38284 case vec_to_scalar:
38285 return ix86_cost->vec_to_scalar_cost;
38287 case scalar_to_vec:
38288 return ix86_cost->scalar_to_vec_cost;
38290 case unaligned_load:
38291 case unaligned_store:
38292 return ix86_cost->vec_unalign_load_cost;
38294 case cond_branch_taken:
38295 return ix86_cost->cond_taken_branch_cost;
38297 case cond_branch_not_taken:
38298 return ix86_cost->cond_not_taken_branch_cost;
38300 case vec_perm:
38301 case vec_promote_demote:
38302 return ix86_cost->vec_stmt_cost;
38304 case vec_construct:
38305 elements = TYPE_VECTOR_SUBPARTS (vectype);
38306 return elements / 2 + 1;
38308 default:
38309 gcc_unreachable ();
38313 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38314 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38315 insn every time. */
38317 static GTY(()) rtx vselect_insn;
38319 /* Initialize vselect_insn. */
38321 static void
38322 init_vselect_insn (void)
38324 unsigned i;
38325 rtx x;
38327 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38328 for (i = 0; i < MAX_VECT_LEN; ++i)
38329 XVECEXP (x, 0, i) = const0_rtx;
38330 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38331 const0_rtx), x);
38332 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38333 start_sequence ();
38334 vselect_insn = emit_insn (x);
38335 end_sequence ();
38338 /* Construct (set target (vec_select op0 (parallel perm))) and
38339 return true if that's a valid instruction in the active ISA. */
38341 static bool
38342 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38343 unsigned nelt, bool testing_p)
38345 unsigned int i;
38346 rtx x, save_vconcat;
38347 int icode;
38349 if (vselect_insn == NULL_RTX)
38350 init_vselect_insn ();
38352 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38353 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38354 for (i = 0; i < nelt; ++i)
38355 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38356 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38357 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38358 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38359 SET_DEST (PATTERN (vselect_insn)) = target;
38360 icode = recog_memoized (vselect_insn);
38362 if (icode >= 0 && !testing_p)
38363 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38365 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38366 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38367 INSN_CODE (vselect_insn) = -1;
38369 return icode >= 0;
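/* Design note on the cached insn above: rather than building a fresh
   (set (reg) (vec_select ...)) for every candidate permutation,
   expand_vselect patches the operands, mode and selector of the single
   GTY-cached vselect_insn, asks recog_memoized whether the active ISA
   has a matching pattern, and copies the pattern out only when actually
   emitting.  The placeholder operands are restored afterwards so the
   cached insn never keeps live RTL alive.  */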
38372 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38374 static bool
38375 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38376 const unsigned char *perm, unsigned nelt,
38377 bool testing_p)
38379 enum machine_mode v2mode;
38380 rtx x;
38381 bool ok;
38383 if (vselect_insn == NULL_RTX)
38384 init_vselect_insn ();
38386 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38387 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38388 PUT_MODE (x, v2mode);
38389 XEXP (x, 0) = op0;
38390 XEXP (x, 1) = op1;
38391 ok = expand_vselect (target, x, perm, nelt, testing_p);
38392 XEXP (x, 0) = const0_rtx;
38393 XEXP (x, 1) = const0_rtx;
38394 return ok;
38397 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38398 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38400 static bool
38401 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38403 enum machine_mode vmode = d->vmode;
38404 unsigned i, mask, nelt = d->nelt;
38405 rtx target, op0, op1, x;
38406 rtx rperm[32], vperm;
38408 if (d->one_operand_p)
38409 return false;
38410 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38412 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38414 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38416 else
38417 return false;
38419 /* This is a blend, not a permute. Elements must stay in their
38420 respective lanes. */
38421 for (i = 0; i < nelt; ++i)
38423 unsigned e = d->perm[i];
38424 if (!(e == i || e == i + nelt))
38425 return false;
38428 if (d->testing_p)
38429 return true;
38431 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38432 decision should be extracted elsewhere, so that we only try that
38433 sequence once all budget==3 options have been tried. */
38434 target = d->target;
38435 op0 = d->op0;
38436 op1 = d->op1;
38437 mask = 0;
38439 switch (vmode)
38441 case V4DFmode:
38442 case V8SFmode:
38443 case V2DFmode:
38444 case V4SFmode:
38445 case V8HImode:
38446 case V8SImode:
38447 for (i = 0; i < nelt; ++i)
38448 mask |= (d->perm[i] >= nelt) << i;
38449 break;
38451 case V2DImode:
38452 for (i = 0; i < 2; ++i)
38453 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38454 vmode = V8HImode;
38455 goto do_subreg;
38457 case V4SImode:
38458 for (i = 0; i < 4; ++i)
38459 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38460 vmode = V8HImode;
38461 goto do_subreg;
38463 case V16QImode:
38464 /* See if bytes move in pairs so we can use pblendw with
38465 an immediate argument, rather than pblendvb with a vector
38466 argument. */
38467 for (i = 0; i < 16; i += 2)
38468 if (d->perm[i] + 1 != d->perm[i + 1])
38470 use_pblendvb:
38471 for (i = 0; i < nelt; ++i)
38472 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38474 finish_pblendvb:
38475 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38476 vperm = force_reg (vmode, vperm);
38478 if (GET_MODE_SIZE (vmode) == 16)
38479 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38480 else
38481 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38482 return true;
38485 for (i = 0; i < 8; ++i)
38486 mask |= (d->perm[i * 2] >= 16) << i;
38487 vmode = V8HImode;
38488 /* FALLTHRU */
38490 do_subreg:
38491 target = gen_lowpart (vmode, target);
38492 op0 = gen_lowpart (vmode, op0);
38493 op1 = gen_lowpart (vmode, op1);
38494 break;
38496 case V32QImode:
38497 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38498 for (i = 0; i < 32; i += 2)
38499 if (d->perm[i] + 1 != d->perm[i + 1])
38500 goto use_pblendvb;
38501 /* See if bytes move in quadruplets. If yes, vpblendd
38502 with immediate can be used. */
38503 for (i = 0; i < 32; i += 4)
38504 if (d->perm[i] + 2 != d->perm[i + 2])
38505 break;
38506 if (i < 32)
38508 /* See if bytes move the same in both lanes. If yes,
38509 vpblendw with immediate can be used. */
38510 for (i = 0; i < 16; i += 2)
38511 if (d->perm[i] + 16 != d->perm[i + 16])
38512 goto use_pblendvb;
38514 /* Use vpblendw. */
38515 for (i = 0; i < 16; ++i)
38516 mask |= (d->perm[i * 2] >= 32) << i;
38517 vmode = V16HImode;
38518 goto do_subreg;
38521 /* Use vpblendd. */
38522 for (i = 0; i < 8; ++i)
38523 mask |= (d->perm[i * 4] >= 32) << i;
38524 vmode = V8SImode;
38525 goto do_subreg;
38527 case V16HImode:
38528 /* See if words move in pairs. If yes, vpblendd can be used. */
38529 for (i = 0; i < 16; i += 2)
38530 if (d->perm[i] + 1 != d->perm[i + 1])
38531 break;
38532 if (i < 16)
38534 /* See if words move the same in both lanes. If not,
38535 vpblendvb must be used. */
38536 for (i = 0; i < 8; i++)
38537 if (d->perm[i] + 8 != d->perm[i + 8])
38539 /* Use vpblendvb. */
38540 for (i = 0; i < 32; ++i)
38541 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38543 vmode = V32QImode;
38544 nelt = 32;
38545 target = gen_lowpart (vmode, target);
38546 op0 = gen_lowpart (vmode, op0);
38547 op1 = gen_lowpart (vmode, op1);
38548 goto finish_pblendvb;
38551 /* Use vpblendw. */
38552 for (i = 0; i < 16; ++i)
38553 mask |= (d->perm[i] >= 16) << i;
38554 break;
38557 /* Use vpblendd. */
38558 for (i = 0; i < 8; ++i)
38559 mask |= (d->perm[i * 2] >= 16) << i;
38560 vmode = V8SImode;
38561 goto do_subreg;
38563 case V4DImode:
38564 /* Use vpblendd. */
38565 for (i = 0; i < 4; ++i)
38566 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38567 vmode = V8SImode;
38568 goto do_subreg;
38570 default:
38571 gcc_unreachable ();
38574 /* This matches five different patterns with the different modes. */
38575 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38576 x = gen_rtx_SET (VOIDmode, target, x);
38577 emit_insn (x);
38579 return true;
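/* Illustrative example of the blend test above: a blend keeps each
   element in its own lane, choosing it from either op0 or op1.  For
   V4SFmode with d->perm = { 0, 5, 2, 7 }, elements 1 and 3 satisfy
   perm[i] == i + nelt and come from op1, so the loop builds mask 0b1010
   and a single blendps immediate implements the permutation; a perm
   such as { 1, 0, 2, 3 } is rejected because it moves elements across
   positions.  */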
38582 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38583 in terms of the variable form of vpermilps.
38585 Note that we will have already failed the immediate input vpermilps,
38586 which requires that the high and low part shuffle be identical; the
38587 variable form doesn't require that. */
38589 static bool
38590 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38592 rtx rperm[8], vperm;
38593 unsigned i;
38595 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38596 return false;
38598 /* We can only permute within the 128-bit lane. */
38599 for (i = 0; i < 8; ++i)
38601 unsigned e = d->perm[i];
38602 if (i < 4 ? e >= 4 : e < 4)
38603 return false;
38606 if (d->testing_p)
38607 return true;
38609 for (i = 0; i < 8; ++i)
38611 unsigned e = d->perm[i];
38613 /* Within each 128-bit lane, the elements of op0 are numbered
38614 from 0 and the elements of op1 are numbered from 4. */
38615 if (e >= 8 + 4)
38616 e -= 8;
38617 else if (e >= 4)
38618 e -= 4;
38620 rperm[i] = GEN_INT (e);
38623 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38624 vperm = force_reg (V8SImode, vperm);
38625 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38627 return true;
38630 /* Return true if permutation D can be performed as VMODE permutation
38631 instead. */
38633 static bool
38634 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38636 unsigned int i, j, chunk;
38638 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38639 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38640 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38641 return false;
38643 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38644 return true;
38646 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38647 for (i = 0; i < d->nelt; i += chunk)
38648 if (d->perm[i] & (chunk - 1))
38649 return false;
38650 else
38651 for (j = 1; j < chunk; ++j)
38652 if (d->perm[i] + j != d->perm[i + j])
38653 return false;
38655 return true;
38658 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
38659 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38661 static bool
38662 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38664 unsigned i, nelt, eltsz, mask;
38665 unsigned char perm[32];
38666 enum machine_mode vmode = V16QImode;
38667 rtx rperm[32], vperm, target, op0, op1;
38669 nelt = d->nelt;
38671 if (!d->one_operand_p)
38673 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38675 if (TARGET_AVX2
38676 && valid_perm_using_mode_p (V2TImode, d))
38678 if (d->testing_p)
38679 return true;
38681 /* Use vperm2i128 insn. The pattern uses
38682 V4DImode instead of V2TImode. */
38683 target = gen_lowpart (V4DImode, d->target);
38684 op0 = gen_lowpart (V4DImode, d->op0);
38685 op1 = gen_lowpart (V4DImode, d->op1);
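/* The vperm2i128 immediate selects each 128-bit half of the result
   independently: bits 1:0 pick the source chunk (op0 low, op0 high,
   op1 low, op1 high) for the low half, and bits 5:4 do the same for
   the high half.  */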
38686 rperm[0]
38687 = GEN_INT ((d->perm[0] / (nelt / 2))
38688 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38689 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38690 return true;
38692 return false;
38695 else
38697 if (GET_MODE_SIZE (d->vmode) == 16)
38699 if (!TARGET_SSSE3)
38700 return false;
38702 else if (GET_MODE_SIZE (d->vmode) == 32)
38704 if (!TARGET_AVX2)
38705 return false;
38707 /* V4DImode should already have been handled through
38708 expand_vselect by the vpermq instruction. */
38709 gcc_assert (d->vmode != V4DImode);
38711 vmode = V32QImode;
38712 if (d->vmode == V8SImode
38713 || d->vmode == V16HImode
38714 || d->vmode == V32QImode)
38716 /* First see if vpermq can be used for
38717 V8SImode/V16HImode/V32QImode. */
38718 if (valid_perm_using_mode_p (V4DImode, d))
38720 for (i = 0; i < 4; i++)
38721 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
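/* perm[] now says, for each 64-bit quarter of the result, which
   64-bit quarter of the source supplies it; this is exactly the form
   the vpermq expansion below expects.  */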
38722 if (d->testing_p)
38723 return true;
38724 return expand_vselect (gen_lowpart (V4DImode, d->target),
38725 gen_lowpart (V4DImode, d->op0),
38726 perm, 4, false);
38729 /* Next see if vpermd can be used. */
38730 if (valid_perm_using_mode_p (V8SImode, d))
38731 vmode = V8SImode;
38733 /* Or if vpermps can be used. */
38734 else if (d->vmode == V8SFmode)
38735 vmode = V8SImode;
38737 if (vmode == V32QImode)
38739 /* vpshufb only shuffles within a 128-bit lane; it is not
38740 possible to move bytes between the lanes. */
38741 for (i = 0; i < nelt; ++i)
38742 if ((d->perm[i] ^ i) & (nelt / 2))
38743 return false;
38746 else
38747 return false;
38750 if (d->testing_p)
38751 return true;
38753 if (vmode == V8SImode)
38754 for (i = 0; i < 8; ++i)
38755 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38756 else
38758 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38759 if (!d->one_operand_p)
38760 mask = 2 * nelt - 1;
38761 else if (vmode == V16QImode)
38762 mask = nelt - 1;
38763 else
38764 mask = nelt / 2 - 1;
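/* Expand the element-level permutation into a byte-level shuffle
   control: each element index E becomes the eltsz consecutive byte
   indexes E*eltsz .. E*eltsz+eltsz-1.  */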
38766 for (i = 0; i < nelt; ++i)
38768 unsigned j, e = d->perm[i] & mask;
38769 for (j = 0; j < eltsz; ++j)
38770 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38774 vperm = gen_rtx_CONST_VECTOR (vmode,
38775 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38776 vperm = force_reg (vmode, vperm);
38778 target = gen_lowpart (vmode, d->target);
38779 op0 = gen_lowpart (vmode, d->op0);
38780 if (d->one_operand_p)
38782 if (vmode == V16QImode)
38783 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38784 else if (vmode == V32QImode)
38785 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38786 else if (vmode == V8SFmode)
38787 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38788 else
38789 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38791 else
38793 op1 = gen_lowpart (vmode, d->op1);
38794 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38797 return true;
38800 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
38801 in a single instruction. */
38803 static bool
38804 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38806 unsigned i, nelt = d->nelt;
38807 unsigned char perm2[MAX_VECT_LEN];
38809 /* Check plain VEC_SELECT first, because AVX has instructions that could
38810 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38811 input where SEL+CONCAT may not. */
38812 if (d->one_operand_p)
38814 int mask = nelt - 1;
38815 bool identity_perm = true;
38816 bool broadcast_perm = true;
38818 for (i = 0; i < nelt; i++)
38820 perm2[i] = d->perm[i] & mask;
38821 if (perm2[i] != i)
38822 identity_perm = false;
38823 if (perm2[i])
38824 broadcast_perm = false;
38827 if (identity_perm)
38829 if (!d->testing_p)
38830 emit_move_insn (d->target, d->op0);
38831 return true;
38833 else if (broadcast_perm && TARGET_AVX2)
38835 /* Use vpbroadcast{b,w,d}. */
38836 rtx (*gen) (rtx, rtx) = NULL;
38837 switch (d->vmode)
38839 case V32QImode:
38840 gen = gen_avx2_pbroadcastv32qi_1;
38841 break;
38842 case V16HImode:
38843 gen = gen_avx2_pbroadcastv16hi_1;
38844 break;
38845 case V8SImode:
38846 gen = gen_avx2_pbroadcastv8si_1;
38847 break;
38848 case V16QImode:
38849 gen = gen_avx2_pbroadcastv16qi;
38850 break;
38851 case V8HImode:
38852 gen = gen_avx2_pbroadcastv8hi;
38853 break;
38854 case V8SFmode:
38855 gen = gen_avx2_vec_dupv8sf_1;
38856 break;
38857 /* For other modes prefer other shuffles this function creates. */
38858 default: break;
38860 if (gen != NULL)
38862 if (!d->testing_p)
38863 emit_insn (gen (d->target, d->op0));
38864 return true;
38868 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38869 return true;
38871 /* There are plenty of patterns in sse.md that are written for
38872 SEL+CONCAT and are not replicated for a single op. Perhaps
38873 that should be changed, to avoid the nastiness here. */
38875 /* Recognize interleave style patterns, which means incrementing
38876 every other permutation operand. */
38877 for (i = 0; i < nelt; i += 2)
38879 perm2[i] = d->perm[i] & mask;
38880 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38882 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38883 d->testing_p))
38884 return true;
38886 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38887 if (nelt >= 4)
38889 for (i = 0; i < nelt; i += 4)
38891 perm2[i + 0] = d->perm[i + 0] & mask;
38892 perm2[i + 1] = d->perm[i + 1] & mask;
38893 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38894 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38897 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38898 d->testing_p))
38899 return true;
38903 /* Finally, try the fully general two operand permute. */
38904 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38905 d->testing_p))
38906 return true;
38908 /* Recognize interleave style patterns with reversed operands. */
38909 if (!d->one_operand_p)
38911 for (i = 0; i < nelt; ++i)
38913 unsigned e = d->perm[i];
38914 if (e >= nelt)
38915 e -= nelt;
38916 else
38917 e += nelt;
38918 perm2[i] = e;
38921 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38922 d->testing_p))
38923 return true;
38926 /* Try the SSE4.1 blend variable merge instructions. */
38927 if (expand_vec_perm_blend (d))
38928 return true;
38930 /* Try one of the AVX vpermil variable permutations. */
38931 if (expand_vec_perm_vpermil (d))
38932 return true;
38934 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38935 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38936 if (expand_vec_perm_pshufb (d))
38937 return true;
38939 return false;
38942 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
38943 in terms of a pair of pshuflw + pshufhw instructions. */
38945 static bool
38946 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38948 unsigned char perm2[MAX_VECT_LEN];
38949 unsigned i;
38950 bool ok;
38952 if (d->vmode != V8HImode || !d->one_operand_p)
38953 return false;
38955 /* The two permutations only operate in 64-bit lanes. */
38956 for (i = 0; i < 4; ++i)
38957 if (d->perm[i] >= 4)
38958 return false;
38959 for (i = 4; i < 8; ++i)
38960 if (d->perm[i] < 4)
38961 return false;
38963 if (d->testing_p)
38964 return true;
38966 /* Emit the pshuflw. */
38967 memcpy (perm2, d->perm, 4);
38968 for (i = 4; i < 8; ++i)
38969 perm2[i] = i;
38970 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38971 gcc_assert (ok);
38973 /* Emit the pshufhw. */
38974 memcpy (perm2 + 4, d->perm + 4, 4);
38975 for (i = 0; i < 4; ++i)
38976 perm2[i] = i;
38977 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38978 gcc_assert (ok);
38980 return true;
38983 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
38984 the permutation using the SSSE3 palignr instruction. This succeeds
38985 when all of the elements in PERM fit within one vector and we merely
38986 need to shift them down so that a single vector permutation has a
38987 chance to succeed. */
38989 static bool
38990 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38992 unsigned i, nelt = d->nelt;
38993 unsigned min, max;
38994 bool in_order, ok;
38995 rtx shift;
38997 /* Even with AVX, palignr only operates on 128-bit vectors. */
38998 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38999 return false;
39001 min = nelt, max = 0;
39002 for (i = 0; i < nelt; ++i)
39004 unsigned e = d->perm[i];
39005 if (e < min)
39006 min = e;
39007 if (e > max)
39008 max = e;
39010 if (min == 0 || max - min >= nelt)
39011 return false;
39013 /* Given that we have SSSE3, we know we'll be able to implement the
39014 single operand permutation after the palignr with pshufb. */
39015 if (d->testing_p)
39016 return true;
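/* The palignr below concatenates op1 (high part) with op0 (low part)
   and shifts the pair right by MIN elements, so every referenced
   element becomes reachable from a single vector; d->perm is rebased
   by MIN below to match.  */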
39018 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39019 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39020 gen_lowpart (TImode, d->op1),
39021 gen_lowpart (TImode, d->op0), shift));
39023 d->op0 = d->op1 = d->target;
39024 d->one_operand_p = true;
39026 in_order = true;
39027 for (i = 0; i < nelt; ++i)
39029 unsigned e = d->perm[i] - min;
39030 if (e != i)
39031 in_order = false;
39032 d->perm[i] = e;
39035 /* Test for the degenerate case where the alignment by itself
39036 produces the desired permutation. */
39037 if (in_order)
39038 return true;
39040 ok = expand_vec_perm_1 (d);
39041 gcc_assert (ok);
39043 return ok;
39046 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39048 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39049 a two vector permutation into a single vector permutation by using
39050 an interleave operation to merge the vectors. */
39052 static bool
39053 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39055 struct expand_vec_perm_d dremap, dfinal;
39056 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39057 unsigned HOST_WIDE_INT contents;
39058 unsigned char remap[2 * MAX_VECT_LEN];
39059 rtx seq;
39060 bool ok, same_halves = false;
39062 if (GET_MODE_SIZE (d->vmode) == 16)
39064 if (d->one_operand_p)
39065 return false;
39067 else if (GET_MODE_SIZE (d->vmode) == 32)
39069 if (!TARGET_AVX)
39070 return false;
39071 /* For 32-byte modes, allow this even when d->one_operand_p.
39072 The lack of cross-lane shuffling in some instructions
39073 might prevent a single insn shuffle. */
39074 dfinal = *d;
39075 dfinal.testing_p = true;
39076 /* If expand_vec_perm_interleave3 can expand this into
39077 a 3 insn sequence, give up here and let it be expanded
39078 that way instead. While that is one insn longer, it
39079 doesn't need a memory operand, and in the common case
39080 where the interleave low and interleave high permutations
39081 of the same operands are adjacent, CSE brings the total
39082 for both down to 4 insns. */
39083 if (expand_vec_perm_interleave3 (&dfinal))
39084 return false;
39086 else
39087 return false;
39089 /* Examine from whence the elements come. */
39090 contents = 0;
39091 for (i = 0; i < nelt; ++i)
39092 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39094 memset (remap, 0xff, sizeof (remap));
39095 dremap = *d;
39097 if (GET_MODE_SIZE (d->vmode) == 16)
39099 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39101 /* Split the two input vectors into 4 halves. */
39102 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39103 h2 = h1 << nelt2;
39104 h3 = h2 << nelt2;
39105 h4 = h3 << nelt2;
39107 /* If the elements come only from the low halves, use interleave low;
39108 if only from the high halves, use interleave high. If the elements come
39109 from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39110 if ((contents & (h1 | h3)) == contents)
39112 /* punpckl* */
39113 for (i = 0; i < nelt2; ++i)
39115 remap[i] = i * 2;
39116 remap[i + nelt] = i * 2 + 1;
39117 dremap.perm[i * 2] = i;
39118 dremap.perm[i * 2 + 1] = i + nelt;
39120 if (!TARGET_SSE2 && d->vmode == V4SImode)
39121 dremap.vmode = V4SFmode;
39123 else if ((contents & (h2 | h4)) == contents)
39125 /* punpckh* */
39126 for (i = 0; i < nelt2; ++i)
39128 remap[i + nelt2] = i * 2;
39129 remap[i + nelt + nelt2] = i * 2 + 1;
39130 dremap.perm[i * 2] = i + nelt2;
39131 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39133 if (!TARGET_SSE2 && d->vmode == V4SImode)
39134 dremap.vmode = V4SFmode;
39136 else if ((contents & (h1 | h4)) == contents)
39138 /* shufps */
39139 for (i = 0; i < nelt2; ++i)
39141 remap[i] = i;
39142 remap[i + nelt + nelt2] = i + nelt2;
39143 dremap.perm[i] = i;
39144 dremap.perm[i + nelt2] = i + nelt + nelt2;
39146 if (nelt != 4)
39148 /* shufpd */
39149 dremap.vmode = V2DImode;
39150 dremap.nelt = 2;
39151 dremap.perm[0] = 0;
39152 dremap.perm[1] = 3;
39155 else if ((contents & (h2 | h3)) == contents)
39157 /* shufps */
39158 for (i = 0; i < nelt2; ++i)
39160 remap[i + nelt2] = i;
39161 remap[i + nelt] = i + nelt2;
39162 dremap.perm[i] = i + nelt2;
39163 dremap.perm[i + nelt2] = i + nelt;
39165 if (nelt != 4)
39167 /* shufpd */
39168 dremap.vmode = V2DImode;
39169 dremap.nelt = 2;
39170 dremap.perm[0] = 1;
39171 dremap.perm[1] = 2;
39174 else
39175 return false;
39177 else
39179 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39180 unsigned HOST_WIDE_INT q[8];
39181 unsigned int nonzero_halves[4];
39183 /* Split the two input vectors into 8 quarters. */
39184 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39185 for (i = 1; i < 8; ++i)
39186 q[i] = q[0] << (nelt4 * i);
39187 for (i = 0; i < 4; ++i)
39188 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39190 nonzero_halves[nzcnt] = i;
39191 ++nzcnt;
39194 if (nzcnt == 1)
39196 gcc_assert (d->one_operand_p);
39197 nonzero_halves[1] = nonzero_halves[0];
39198 same_halves = true;
39200 else if (d->one_operand_p)
39202 gcc_assert (nonzero_halves[0] == 0);
39203 gcc_assert (nonzero_halves[1] == 1);
39206 if (nzcnt <= 2)
39208 if (d->perm[0] / nelt2 == nonzero_halves[1])
39210 /* Attempt to increase the likelihood that dfinal
39211 shuffle will be intra-lane. */
39212 char tmph = nonzero_halves[0];
39213 nonzero_halves[0] = nonzero_halves[1];
39214 nonzero_halves[1] = tmph;
39217 /* vperm2f128 or vperm2i128. */
39218 for (i = 0; i < nelt2; ++i)
39220 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39221 remap[i + nonzero_halves[0] * nelt2] = i;
39222 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39223 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39226 if (d->vmode != V8SFmode
39227 && d->vmode != V4DFmode
39228 && d->vmode != V8SImode)
39230 dremap.vmode = V8SImode;
39231 dremap.nelt = 8;
39232 for (i = 0; i < 4; ++i)
39234 dremap.perm[i] = i + nonzero_halves[0] * 4;
39235 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39239 else if (d->one_operand_p)
39240 return false;
39241 else if (TARGET_AVX2
39242 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39244 /* vpunpckl* */
39245 for (i = 0; i < nelt4; ++i)
39247 remap[i] = i * 2;
39248 remap[i + nelt] = i * 2 + 1;
39249 remap[i + nelt2] = i * 2 + nelt2;
39250 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39251 dremap.perm[i * 2] = i;
39252 dremap.perm[i * 2 + 1] = i + nelt;
39253 dremap.perm[i * 2 + nelt2] = i + nelt2;
39254 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39257 else if (TARGET_AVX2
39258 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39260 /* vpunpckh* */
39261 for (i = 0; i < nelt4; ++i)
39263 remap[i + nelt4] = i * 2;
39264 remap[i + nelt + nelt4] = i * 2 + 1;
39265 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39266 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39267 dremap.perm[i * 2] = i + nelt4;
39268 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39269 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39270 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39273 else
39274 return false;
39277 /* Use the remapping array set up above to move the elements from their
39278 swizzled locations into their final destinations. */
39279 dfinal = *d;
39280 for (i = 0; i < nelt; ++i)
39282 unsigned e = remap[d->perm[i]];
39283 gcc_assert (e < nelt);
39284 /* If same_halves is true, both halves of the remapped vector are the
39285 same. Avoid cross-lane accesses if possible. */
39286 if (same_halves && i >= nelt2)
39288 gcc_assert (e < nelt2);
39289 dfinal.perm[i] = e + nelt2;
39291 else
39292 dfinal.perm[i] = e;
39294 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39295 dfinal.op1 = dfinal.op0;
39296 dfinal.one_operand_p = true;
39297 dremap.target = dfinal.op0;
39299 /* Test if the final remap can be done with a single insn. For V4SFmode or
39300 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39301 start_sequence ();
39302 ok = expand_vec_perm_1 (&dfinal);
39303 seq = get_insns ();
39304 end_sequence ();
39306 if (!ok)
39307 return false;
39309 if (d->testing_p)
39310 return true;
39312 if (dremap.vmode != dfinal.vmode)
39314 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39315 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39316 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39319 ok = expand_vec_perm_1 (&dremap);
39320 gcc_assert (ok);
39322 emit_insn (seq);
39323 return true;
39326 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39327 a single vector cross-lane permutation into vpermq followed
39328 by any of the single insn permutations. */
39330 static bool
39331 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39333 struct expand_vec_perm_d dremap, dfinal;
39334 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39335 unsigned contents[2];
39336 bool ok;
39338 if (!(TARGET_AVX2
39339 && (d->vmode == V32QImode || d->vmode == V16HImode)
39340 && d->one_operand_p))
39341 return false;
39343 contents[0] = 0;
39344 contents[1] = 0;
39345 for (i = 0; i < nelt2; ++i)
39347 contents[0] |= 1u << (d->perm[i] / nelt4);
39348 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39351 for (i = 0; i < 2; ++i)
39353 unsigned int cnt = 0;
39354 for (j = 0; j < 4; ++j)
39355 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39356 return false;
39359 if (d->testing_p)
39360 return true;
39362 dremap = *d;
39363 dremap.vmode = V4DImode;
39364 dremap.nelt = 4;
39365 dremap.target = gen_reg_rtx (V4DImode);
39366 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39367 dremap.op1 = dremap.op0;
39368 dremap.one_operand_p = true;
39369 for (i = 0; i < 2; ++i)
39371 unsigned int cnt = 0;
39372 for (j = 0; j < 4; ++j)
39373 if ((contents[i] & (1u << j)) != 0)
39374 dremap.perm[2 * i + cnt++] = j;
39375 for (; cnt < 2; ++cnt)
39376 dremap.perm[2 * i + cnt] = 0;
39379 dfinal = *d;
39380 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39381 dfinal.op1 = dfinal.op0;
39382 dfinal.one_operand_p = true;
39383 for (i = 0, j = 0; i < nelt; ++i)
39385 if (i == nelt2)
39386 j = 2;
39387 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39388 if ((d->perm[i] / nelt4) == dremap.perm[j])
39390 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39391 dfinal.perm[i] |= nelt4;
39392 else
39393 gcc_unreachable ();
39396 ok = expand_vec_perm_1 (&dremap);
39397 gcc_assert (ok);
39399 ok = expand_vec_perm_1 (&dfinal);
39400 gcc_assert (ok);
39402 return true;
39405 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
39406 a vector permutation using two instructions, vperm2f128 resp.
39407 vperm2i128 followed by any single in-lane permutation. */
39409 static bool
39410 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39412 struct expand_vec_perm_d dfirst, dsecond;
39413 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39414 bool ok;
39416 if (!TARGET_AVX
39417 || GET_MODE_SIZE (d->vmode) != 32
39418 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39419 return false;
39421 dsecond = *d;
39422 dsecond.one_operand_p = false;
39423 dsecond.testing_p = true;
39425 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39426 immediate. For perm < 16 the second permutation uses
39427 d->op0 as first operand, for perm >= 16 it uses d->op1
39428 as first operand. The second operand is the result of
39429 vperm2[fi]128. */
39430 for (perm = 0; perm < 32; perm++)
39432 /* Ignore permutations which do not move anything cross-lane. */
39433 if (perm < 16)
39435 /* The second shuffle for e.g. V4DFmode has
39436 0123 and ABCD operands.
39437 Ignore AB23, as 23 is already in the second lane
39438 of the first operand. */
39439 if ((perm & 0xc) == (1 << 2)) continue;
39440 /* And 01CD, as 01 is in the first lane of the first
39441 operand. */
39442 if ((perm & 3) == 0) continue;
39443 /* And 4567, as then the vperm2[fi]128 doesn't change
39444 anything on the original 4567 second operand. */
39445 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39447 else
39449 /* The second shuffle for e.g. V4DFmode has
39450 4567 and ABCD operands.
39451 Ignore AB67, as 67 is already in the second lane
39452 of the first operand. */
39453 if ((perm & 0xc) == (3 << 2)) continue;
39454 /* And 45CD, as 45 is in the first lane of the first
39455 operand. */
39456 if ((perm & 3) == 2) continue;
39457 /* And 0123, as then the vperm2[fi]128 doesn't change
39458 anything on the original 0123 first operand. */
39459 if ((perm & 0xf) == (1 << 2)) continue;
39462 for (i = 0; i < nelt; i++)
39464 j = d->perm[i] / nelt2;
39465 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39466 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39467 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39468 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39469 else
39470 break;
39473 if (i == nelt)
39475 start_sequence ();
39476 ok = expand_vec_perm_1 (&dsecond);
39477 end_sequence ();
39479 else
39480 ok = false;
39482 if (ok)
39484 if (d->testing_p)
39485 return true;
39487 /* Found a usable second shuffle. dfirst will be
39488 vperm2f128 on d->op0 and d->op1. */
39489 dsecond.testing_p = false;
39490 dfirst = *d;
39491 dfirst.target = gen_reg_rtx (d->vmode);
39492 for (i = 0; i < nelt; i++)
39493 dfirst.perm[i] = (i & (nelt2 - 1))
39494 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39496 ok = expand_vec_perm_1 (&dfirst);
39497 gcc_assert (ok);
39499 /* And dsecond is some single insn shuffle, taking
39500 d->op0 and result of vperm2f128 (if perm < 16) or
39501 d->op1 and result of vperm2f128 (otherwise). */
39502 dsecond.op1 = dfirst.target;
39503 if (perm >= 16)
39504 dsecond.op0 = dfirst.op1;
39506 ok = expand_vec_perm_1 (&dsecond);
39507 gcc_assert (ok);
39509 return true;
39512 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39513 if (d->one_operand_p)
39514 return false;
39517 return false;
39520 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39521 a two vector permutation using 2 intra-lane interleave insns
39522 and cross-lane shuffle for 32-byte vectors. */
39524 static bool
39525 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39527 unsigned i, nelt;
39528 rtx (*gen) (rtx, rtx, rtx);
39530 if (d->one_operand_p)
39531 return false;
39532 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39534 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39536 else
39537 return false;
39539 nelt = d->nelt;
39540 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39541 return false;
39542 for (i = 0; i < nelt; i += 2)
39543 if (d->perm[i] != d->perm[0] + i / 2
39544 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39545 return false;
39547 if (d->testing_p)
39548 return true;
39550 switch (d->vmode)
39552 case V32QImode:
39553 if (d->perm[0])
39554 gen = gen_vec_interleave_highv32qi;
39555 else
39556 gen = gen_vec_interleave_lowv32qi;
39557 break;
39558 case V16HImode:
39559 if (d->perm[0])
39560 gen = gen_vec_interleave_highv16hi;
39561 else
39562 gen = gen_vec_interleave_lowv16hi;
39563 break;
39564 case V8SImode:
39565 if (d->perm[0])
39566 gen = gen_vec_interleave_highv8si;
39567 else
39568 gen = gen_vec_interleave_lowv8si;
39569 break;
39570 case V4DImode:
39571 if (d->perm[0])
39572 gen = gen_vec_interleave_highv4di;
39573 else
39574 gen = gen_vec_interleave_lowv4di;
39575 break;
39576 case V8SFmode:
39577 if (d->perm[0])
39578 gen = gen_vec_interleave_highv8sf;
39579 else
39580 gen = gen_vec_interleave_lowv8sf;
39581 break;
39582 case V4DFmode:
39583 if (d->perm[0])
39584 gen = gen_vec_interleave_highv4df;
39585 else
39586 gen = gen_vec_interleave_lowv4df;
39587 break;
39588 default:
39589 gcc_unreachable ();
39592 emit_insn (gen (d->target, d->op0, d->op1));
39593 return true;
39596 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
39597 a single vector permutation using a single intra-lane vector
39598 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39599 the non-swapped and swapped vectors together. */
39601 static bool
39602 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39604 struct expand_vec_perm_d dfirst, dsecond;
39605 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39606 rtx seq;
39607 bool ok;
39608 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39610 if (!TARGET_AVX
39611 || TARGET_AVX2
39612 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39613 || !d->one_operand_p)
39614 return false;
39616 dfirst = *d;
39617 for (i = 0; i < nelt; i++)
39618 dfirst.perm[i] = 0xff;
39619 for (i = 0, msk = 0; i < nelt; i++)
39621 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39622 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39623 return false;
39624 dfirst.perm[j] = d->perm[i];
39625 if (j != i)
39626 msk |= (1 << i);
39628 for (i = 0; i < nelt; i++)
39629 if (dfirst.perm[i] == 0xff)
39630 dfirst.perm[i] = i;
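/* At this point msk has bit I set exactly when element I must be taken
   from the lane-swapped copy computed below; it becomes the immediate
   of the final vblendps/vblendpd.  */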
39632 if (!d->testing_p)
39633 dfirst.target = gen_reg_rtx (dfirst.vmode);
39635 start_sequence ();
39636 ok = expand_vec_perm_1 (&dfirst);
39637 seq = get_insns ();
39638 end_sequence ();
39640 if (!ok)
39641 return false;
39643 if (d->testing_p)
39644 return true;
39646 emit_insn (seq);
39648 dsecond = *d;
39649 dsecond.op0 = dfirst.target;
39650 dsecond.op1 = dfirst.target;
39651 dsecond.one_operand_p = true;
39652 dsecond.target = gen_reg_rtx (dsecond.vmode);
39653 for (i = 0; i < nelt; i++)
39654 dsecond.perm[i] = i ^ nelt2;
39656 ok = expand_vec_perm_1 (&dsecond);
39657 gcc_assert (ok);
39659 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39660 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39661 return true;
39664 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
39665 permutation using two vperm2f128, followed by a vshufpd insn blending
39666 the two vectors together. */
39668 static bool
39669 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39671 struct expand_vec_perm_d dfirst, dsecond, dthird;
39672 bool ok;
39674 if (!TARGET_AVX || (d->vmode != V4DFmode))
39675 return false;
39677 if (d->testing_p)
39678 return true;
39680 dfirst = *d;
39681 dsecond = *d;
39682 dthird = *d;
39684 dfirst.perm[0] = (d->perm[0] & ~1);
39685 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39686 dfirst.perm[2] = (d->perm[2] & ~1);
39687 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39688 dsecond.perm[0] = (d->perm[1] & ~1);
39689 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39690 dsecond.perm[2] = (d->perm[3] & ~1);
39691 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39692 dthird.perm[0] = (d->perm[0] % 2);
39693 dthird.perm[1] = (d->perm[1] % 2) + 4;
39694 dthird.perm[2] = (d->perm[2] % 2) + 2;
39695 dthird.perm[3] = (d->perm[3] % 2) + 6;
39697 dfirst.target = gen_reg_rtx (dfirst.vmode);
39698 dsecond.target = gen_reg_rtx (dsecond.vmode);
39699 dthird.op0 = dfirst.target;
39700 dthird.op1 = dsecond.target;
39701 dthird.one_operand_p = false;
39703 canonicalize_perm (&dfirst);
39704 canonicalize_perm (&dsecond);
39706 ok = expand_vec_perm_1 (&dfirst)
39707 && expand_vec_perm_1 (&dsecond)
39708 && expand_vec_perm_1 (&dthird);
39710 gcc_assert (ok);
39712 return true;
39715 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39716 permutation with two pshufb insns and an ior. We should have already
39717 failed all two instruction sequences. */
39719 static bool
39720 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39722 rtx rperm[2][16], vperm, l, h, op, m128;
39723 unsigned int i, nelt, eltsz;
39725 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39726 return false;
39727 gcc_assert (!d->one_operand_p);
39729 nelt = d->nelt;
39730 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39732 /* Generate two permutation masks. If the required element is within
39733 the given vector it is shuffled into the proper lane. If the required
39734 element is in the other vector, force a zero into the lane by setting
39735 bit 7 in the permutation mask. */
39736 m128 = GEN_INT (-128);
39737 for (i = 0; i < nelt; ++i)
39739 unsigned j, e = d->perm[i];
39740 unsigned which = (e >= nelt);
39741 if (e >= nelt)
39742 e -= nelt;
39744 for (j = 0; j < eltsz; ++j)
39746 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39747 rperm[1-which][i*eltsz + j] = m128;
39751 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39752 vperm = force_reg (V16QImode, vperm);
39754 l = gen_reg_rtx (V16QImode);
39755 op = gen_lowpart (V16QImode, d->op0);
39756 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39758 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39759 vperm = force_reg (V16QImode, vperm);
39761 h = gen_reg_rtx (V16QImode);
39762 op = gen_lowpart (V16QImode, d->op1);
39763 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39765 op = gen_lowpart (V16QImode, d->target);
39766 emit_insn (gen_iorv16qi3 (op, l, h));
39768 return true;
39771 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39772 with two vpshufb insns, vpermq and vpor. We should have already failed
39773 all two or three instruction sequences. */
39775 static bool
39776 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39778 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39779 unsigned int i, nelt, eltsz;
39781 if (!TARGET_AVX2
39782 || !d->one_operand_p
39783 || (d->vmode != V32QImode && d->vmode != V16HImode))
39784 return false;
39786 if (d->testing_p)
39787 return true;
39789 nelt = d->nelt;
39790 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39792 /* Generate two permutation masks. If the required element is within
39793 the same lane, it is shuffled in. If the required element is from the
39794 other lane, force a zero by setting bit 7 in the permutation mask.
39795 The other mask has non-negative entries for elements requested from
39796 the other lane, but places them at lane-swapped positions, so that the
39797 result of vpshufb only needs its two V2TImode halves
39798 swapped. */
39799 m128 = GEN_INT (-128);
39800 for (i = 0; i < nelt; ++i)
39802 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39803 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39805 for (j = 0; j < eltsz; ++j)
39807 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39808 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39812 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39813 vperm = force_reg (V32QImode, vperm);
39815 h = gen_reg_rtx (V32QImode);
39816 op = gen_lowpart (V32QImode, d->op0);
39817 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39819 /* Swap the 128-bit lanes of h into hp. */
39820 hp = gen_reg_rtx (V4DImode);
39821 op = gen_lowpart (V4DImode, h);
39822 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39823 const1_rtx));
39825 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39826 vperm = force_reg (V32QImode, vperm);
39828 l = gen_reg_rtx (V32QImode);
39829 op = gen_lowpart (V32QImode, d->op0);
39830 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39832 op = gen_lowpart (V32QImode, d->target);
39833 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39835 return true;
39838 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39839 and extract-odd permutations of two V32QImode or V16HImode operands
39840 with two vpshufb insns, vpor and vpermq. We should have already
39841 failed all two or three instruction sequences. */
39843 static bool
39844 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39846 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39847 unsigned int i, nelt, eltsz;
39849 if (!TARGET_AVX2
39850 || d->one_operand_p
39851 || (d->vmode != V32QImode && d->vmode != V16HImode))
39852 return false;
39854 for (i = 0; i < d->nelt; ++i)
39855 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39856 return false;
39858 if (d->testing_p)
39859 return true;
39861 nelt = d->nelt;
39862 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39864 /* Generate two permutation masks. In the first permutation mask
39865 the first quarter will contain indexes for the first half
39866 of the op0, the second quarter will contain bit 7 set, third quarter
39867 will contain indexes for the second half of the op0 and the
39868 last quarter bit 7 set. In the second permutation mask
39869 the first quarter will contain bit 7 set, the second quarter
39870 indexes for the first half of the op1, the third quarter bit 7 set
39871 and last quarter indexes for the second half of the op1.
39872 I.e. the first mask e.g. for V32QImode extract even will be:
39873 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39874 (all values masked with 0xf except for -128) and second mask
39875 for extract even will be
39876 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39877 m128 = GEN_INT (-128);
39878 for (i = 0; i < nelt; ++i)
39880 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39881 unsigned which = d->perm[i] >= nelt;
39882 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39884 for (j = 0; j < eltsz; ++j)
39886 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39887 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39891 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39892 vperm = force_reg (V32QImode, vperm);
39894 l = gen_reg_rtx (V32QImode);
39895 op = gen_lowpart (V32QImode, d->op0);
39896 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39898 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39899 vperm = force_reg (V32QImode, vperm);
39901 h = gen_reg_rtx (V32QImode);
39902 op = gen_lowpart (V32QImode, d->op1);
39903 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39905 ior = gen_reg_rtx (V32QImode);
39906 emit_insn (gen_iorv32qi3 (ior, l, h));
39908 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39909 op = gen_lowpart (V4DImode, d->target);
39910 ior = gen_lowpart (V4DImode, ior);
39911 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39912 const1_rtx, GEN_INT (3)));
39914 return true;
39917 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
39918 and extract-odd permutations. */
39920 static bool
39921 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39923 rtx t1, t2, t3;
39925 switch (d->vmode)
39927 case V4DFmode:
39928 t1 = gen_reg_rtx (V4DFmode);
39929 t2 = gen_reg_rtx (V4DFmode);
39931 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39932 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39933 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39935 /* Now an unpck[lh]pd will produce the result required. */
39936 if (odd)
39937 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39938 else
39939 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39940 emit_insn (t3);
39941 break;
39943 case V8SFmode:
39945 int mask = odd ? 0xdd : 0x88;
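/* Within each 128-bit lane, shufps with 0x88 selects elements { 0, 2 }
   of both operands (the evens) and 0xdd selects { 1, 3 } (the odds).  */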
39947 t1 = gen_reg_rtx (V8SFmode);
39948 t2 = gen_reg_rtx (V8SFmode);
39949 t3 = gen_reg_rtx (V8SFmode);
39951 /* Shuffle within the 128-bit lanes to produce:
39952 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39953 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39954 GEN_INT (mask)));
39956 /* Shuffle the lanes around to produce:
39957 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39958 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39959 GEN_INT (0x3)));
39961 /* Shuffle within the 128-bit lanes to produce:
39962 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39963 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39965 /* Shuffle within the 128-bit lanes to produce:
39966 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39967 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39969 /* Shuffle the lanes around to produce:
39970 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39971 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39972 GEN_INT (0x20)));
39974 break;
39976 case V2DFmode:
39977 case V4SFmode:
39978 case V2DImode:
39979 case V4SImode:
39980 /* These are always directly implementable by expand_vec_perm_1. */
39981 gcc_unreachable ();
39983 case V8HImode:
39984 if (TARGET_SSSE3)
39985 return expand_vec_perm_pshufb2 (d);
39986 else
39988 /* We need 2*log2(N)-1 operations to achieve odd/even
39989 with interleave. */
39990 t1 = gen_reg_rtx (V8HImode);
39991 t2 = gen_reg_rtx (V8HImode);
39992 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39993 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39994 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39995 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39996 if (odd)
39997 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39998 else
39999 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40000 emit_insn (t3);
40002 break;
40004 case V16QImode:
40005 if (TARGET_SSSE3)
40006 return expand_vec_perm_pshufb2 (d);
40007 else
40009 t1 = gen_reg_rtx (V16QImode);
40010 t2 = gen_reg_rtx (V16QImode);
40011 t3 = gen_reg_rtx (V16QImode);
40012 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40013 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40014 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40015 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40016 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40017 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40018 if (odd)
40019 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40020 else
40021 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40022 emit_insn (t3);
40024 break;
40026 case V16HImode:
40027 case V32QImode:
40028 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40030 case V4DImode:
40031 if (!TARGET_AVX2)
40033 struct expand_vec_perm_d d_copy = *d;
40034 d_copy.vmode = V4DFmode;
40035 d_copy.target = gen_lowpart (V4DFmode, d->target);
40036 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40037 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40038 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40041 t1 = gen_reg_rtx (V4DImode);
40042 t2 = gen_reg_rtx (V4DImode);
40044 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40045 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40046 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40048 /* Now an vpunpck[lh]qdq will produce the result required. */
40049 if (odd)
40050 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40051 else
40052 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40053 emit_insn (t3);
40054 break;
40056 case V8SImode:
40057 if (!TARGET_AVX2)
40059 struct expand_vec_perm_d d_copy = *d;
40060 d_copy.vmode = V8SFmode;
40061 d_copy.target = gen_lowpart (V8SFmode, d->target);
40062 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40063 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40064 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40067 t1 = gen_reg_rtx (V8SImode);
40068 t2 = gen_reg_rtx (V8SImode);
40070 /* Shuffle the lanes around into
40071 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40072 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40073 gen_lowpart (V4DImode, d->op0),
40074 gen_lowpart (V4DImode, d->op1),
40075 GEN_INT (0x20)));
40076 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40077 gen_lowpart (V4DImode, d->op0),
40078 gen_lowpart (V4DImode, d->op1),
40079 GEN_INT (0x31)));
40081 /* Swap the 2nd and 3rd position in each lane into
40082 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40083 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40084 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40085 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40086 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40088 /* Now an vpunpck[lh]qdq will produce
40089 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40090 if (odd)
40091 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40092 gen_lowpart (V4DImode, t1),
40093 gen_lowpart (V4DImode, t2));
40094 else
40095 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40096 gen_lowpart (V4DImode, t1),
40097 gen_lowpart (V4DImode, t2));
40098 emit_insn (t3);
40099 break;
40101 default:
40102 gcc_unreachable ();
40105 return true;
40108 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40109 extract-even and extract-odd permutations. */
40111 static bool
40112 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40114 unsigned i, odd, nelt = d->nelt;
40116 odd = d->perm[0];
40117 if (odd != 0 && odd != 1)
40118 return false;
40120 for (i = 1; i < nelt; ++i)
40121 if (d->perm[i] != 2 * i + odd)
40122 return false;
40124 return expand_vec_perm_even_odd_1 (d, odd);
40127 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
40128 permutations. We assume that expand_vec_perm_1 has already failed. */
40130 static bool
40131 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40133 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40134 enum machine_mode vmode = d->vmode;
40135 unsigned char perm2[4];
40136 rtx op0 = d->op0;
40137 bool ok;
40139 switch (vmode)
40141 case V4DFmode:
40142 case V8SFmode:
40143 /* These are special-cased in sse.md so that we can optionally
40144 use the vbroadcast instruction. They expand to two insns
40145 if the input happens to be in a register. */
40146 gcc_unreachable ();
40148 case V2DFmode:
40149 case V2DImode:
40150 case V4SFmode:
40151 case V4SImode:
40152 /* These are always implementable using standard shuffle patterns. */
40153 gcc_unreachable ();
40155 case V8HImode:
40156 case V16QImode:
40157 /* These can be implemented via interleave. We save one insn by
40158 stopping once we have promoted to V4SImode and then using pshufd. */
40161 rtx dest;
40162 rtx (*gen) (rtx, rtx, rtx)
40163 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40164 : gen_vec_interleave_lowv8hi;
40166 if (elt >= nelt2)
40168 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40169 : gen_vec_interleave_highv8hi;
40170 elt -= nelt2;
40172 nelt2 /= 2;
40174 dest = gen_reg_rtx (vmode);
40175 emit_insn (gen (dest, op0, op0));
40176 vmode = get_mode_wider_vector (vmode);
40177 op0 = gen_lowpart (vmode, dest);
40179 while (vmode != V4SImode);
40181 memset (perm2, elt, 4);
40182 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40183 d->testing_p);
40184 gcc_assert (ok);
40185 return true;
40187 case V32QImode:
40188 case V16HImode:
40189 case V8SImode:
40190 case V4DImode:
40191 /* For AVX2 broadcasts of the first element vpbroadcast* or
40192 vpermq should be used by expand_vec_perm_1. */
40193 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40194 return false;
40196 default:
40197 gcc_unreachable ();
40201 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40202 broadcast permutations. */
40204 static bool
40205 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40207 unsigned i, elt, nelt = d->nelt;
40209 if (!d->one_operand_p)
40210 return false;
40212 elt = d->perm[0];
40213 for (i = 1; i < nelt; ++i)
40214 if (d->perm[i] != elt)
40215 return false;
40217 return expand_vec_perm_broadcast_1 (d);
40220 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40221 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40222 all the shorter instruction sequences. */
40224 static bool
40225 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40227 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40228 unsigned int i, nelt, eltsz;
40229 bool used[4];
40231 if (!TARGET_AVX2
40232 || d->one_operand_p
40233 || (d->vmode != V32QImode && d->vmode != V16HImode))
40234 return false;
40236 if (d->testing_p)
40237 return true;
40239 nelt = d->nelt;
40240 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40242 /* Generate 4 permutation masks. If the required element is within
40243 the same lane, it is shuffled in. If the required element is from the
40244 other lane, force a zero by setting bit 7 in the permutation mask.
40245 The other mask has non-negative entries for elements requested from
40246 the other lane, but places them at lane-swapped positions, so that the
40247 result of vpshufb only needs its two V2TImode halves
40248 swapped. */
40249 m128 = GEN_INT (-128);
40250 for (i = 0; i < 32; ++i)
40252 rperm[0][i] = m128;
40253 rperm[1][i] = m128;
40254 rperm[2][i] = m128;
40255 rperm[3][i] = m128;
40257 used[0] = false;
40258 used[1] = false;
40259 used[2] = false;
40260 used[3] = false;
40261 for (i = 0; i < nelt; ++i)
40263 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40264 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40265 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40267 for (j = 0; j < eltsz; ++j)
40268 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40269 used[which] = true;
40272 for (i = 0; i < 2; ++i)
40274 if (!used[2 * i + 1])
40276 h[i] = NULL_RTX;
40277 continue;
40279 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40280 gen_rtvec_v (32, rperm[2 * i + 1]));
40281 vperm = force_reg (V32QImode, vperm);
40282 h[i] = gen_reg_rtx (V32QImode);
40283 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40284 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40287 /* Swap the 128-bit lanes of h[X]. */
40288 for (i = 0; i < 2; ++i)
40290 if (h[i] == NULL_RTX)
40291 continue;
40292 op = gen_reg_rtx (V4DImode);
40293 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40294 const2_rtx, GEN_INT (3), const0_rtx,
40295 const1_rtx));
40296 h[i] = gen_lowpart (V32QImode, op);
40299 for (i = 0; i < 2; ++i)
40301 if (!used[2 * i])
40303 l[i] = NULL_RTX;
40304 continue;
40306 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40307 vperm = force_reg (V32QImode, vperm);
40308 l[i] = gen_reg_rtx (V32QImode);
40309 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40310 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40313 for (i = 0; i < 2; ++i)
40315 if (h[i] && l[i])
40317 op = gen_reg_rtx (V32QImode);
40318 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40319 l[i] = op;
40321 else if (h[i])
40322 l[i] = h[i];
40325 gcc_assert (l[0] && l[1]);
40326 op = gen_lowpart (V32QImode, d->target);
40327 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40328 return true;
40331 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40332 With all of the interface bits taken care of, perform the expansion
40333 in D and return true on success. */
40335 static bool
40336 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40338 /* Try a single instruction expansion. */
40339 if (expand_vec_perm_1 (d))
40340 return true;
40342 /* Try sequences of two instructions. */
40344 if (expand_vec_perm_pshuflw_pshufhw (d))
40345 return true;
40347 if (expand_vec_perm_palignr (d))
40348 return true;
40350 if (expand_vec_perm_interleave2 (d))
40351 return true;
40353 if (expand_vec_perm_broadcast (d))
40354 return true;
40356 if (expand_vec_perm_vpermq_perm_1 (d))
40357 return true;
40359 if (expand_vec_perm_vperm2f128 (d))
40360 return true;
40362 /* Try sequences of three instructions. */
40364 if (expand_vec_perm_2vperm2f128_vshuf (d))
40365 return true;
40367 if (expand_vec_perm_pshufb2 (d))
40368 return true;
40370 if (expand_vec_perm_interleave3 (d))
40371 return true;
40373 if (expand_vec_perm_vperm2f128_vblend (d))
40374 return true;
40376 /* Try sequences of four instructions. */
40378 if (expand_vec_perm_vpshufb2_vpermq (d))
40379 return true;
40381 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40382 return true;
40384 /* ??? Look for narrow permutations whose element orderings would
40385 allow the promotion to a wider mode. */
40387 /* ??? Look for sequences of interleave or a wider permute that place
40388 the data into the correct lanes for a half-vector shuffle like
40389 pshuf[lh]w or vpermilps. */
40391 /* ??? Look for sequences of interleave that produce the desired results.
40392 The combinatorics of punpck[lh] get pretty ugly... */
40394 if (expand_vec_perm_even_odd (d))
40395 return true;
40397 /* Even longer sequences. */
40398 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40399 return true;
40401 return false;
40404 /* If a permutation only uses one operand, make it clear. Returns true
40405 if the permutation references both operands. */
40407 static bool
40408 canonicalize_perm (struct expand_vec_perm_d *d)
40410 int i, which, nelt = d->nelt;
40412 for (i = which = 0; i < nelt; ++i)
40413 which |= (d->perm[i] < nelt ? 1 : 2);
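/* Bit 0 of WHICH records a reference to the first operand, bit 1 a
   reference to the second.  */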
40415 d->one_operand_p = true;
40416 switch (which)
40418 default:
40419 gcc_unreachable();
40421 case 3:
40422 if (!rtx_equal_p (d->op0, d->op1))
40424 d->one_operand_p = false;
40425 break;
40427 /* The elements of PERM do not suggest that only the first operand
40428 is used, but both operands are identical. Allow easier matching
40429 of the permutation by folding the permutation into the single
40430 input vector. */
40431 /* FALLTHRU */
40433 case 2:
40434 for (i = 0; i < nelt; ++i)
40435 d->perm[i] &= nelt - 1;
40436 d->op0 = d->op1;
40437 break;
40439 case 1:
40440 d->op1 = d->op0;
40441 break;
40444 return (which == 3);
40447 bool
40448 ix86_expand_vec_perm_const (rtx operands[4])
40450 struct expand_vec_perm_d d;
40451 unsigned char perm[MAX_VECT_LEN];
40452 int i, nelt;
40453 bool two_args;
40454 rtx sel;
40456 d.target = operands[0];
40457 d.op0 = operands[1];
40458 d.op1 = operands[2];
40459 sel = operands[3];
40461 d.vmode = GET_MODE (d.target);
40462 gcc_assert (VECTOR_MODE_P (d.vmode));
40463 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40464 d.testing_p = false;
40466 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40467 gcc_assert (XVECLEN (sel, 0) == nelt);
40468 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40470 for (i = 0; i < nelt; ++i)
40472 rtx e = XVECEXP (sel, 0, i);
40473 int ei = INTVAL (e) & (2 * nelt - 1);
40474 d.perm[i] = ei;
40475 perm[i] = ei;
40478 two_args = canonicalize_perm (&d);
40480 if (ix86_expand_vec_perm_const_1 (&d))
40481 return true;
40483 /* If the selector says both arguments are needed, but the operands are the
40484 same, the above tried to expand with one_operand_p and flattened selector.
40485 If that didn't work, retry without one_operand_p; we succeeded with that
40486 during testing. */
40487 if (two_args && d.one_operand_p)
40489 d.one_operand_p = false;
40490 memcpy (d.perm, perm, sizeof (perm));
40491 return ix86_expand_vec_perm_const_1 (&d);
40494 return false;
40497 /* Implement targetm.vectorize.vec_perm_const_ok. */
40499 static bool
40500 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40501 const unsigned char *sel)
40503 struct expand_vec_perm_d d;
40504 unsigned int i, nelt, which;
40505 bool ret;
40507 d.vmode = vmode;
40508 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40509 d.testing_p = true;
40511 /* Given sufficient ISA support we can just return true here
40512 for selected vector modes. */
40513 if (GET_MODE_SIZE (d.vmode) == 16)
40515 /* All implementable with a single vpperm insn. */
40516 if (TARGET_XOP)
40517 return true;
40518 /* All implementable with 2 pshufb + 1 ior. */
40519 if (TARGET_SSSE3)
40520 return true;
40521 /* All implementable with shufpd or unpck[lh]pd. */
40522 if (d.nelt == 2)
40523 return true;
40526 /* Extract the values from the vector CST into the permutation
40527 array in D. */
40528 memcpy (d.perm, sel, nelt);
40529 for (i = which = 0; i < nelt; ++i)
40531 unsigned char e = d.perm[i];
40532 gcc_assert (e < 2 * nelt);
40533 which |= (e < nelt ? 1 : 2);
40536 /* If all elements come from the second vector, fold them into the first. */
40537 if (which == 2)
40538 for (i = 0; i < nelt; ++i)
40539 d.perm[i] -= nelt;
40541 /* Check whether the mask can be applied to the vector type. */
40542 d.one_operand_p = (which != 3);
40544 /* Implementable with shufps or pshufd. */
40545 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40546 return true;
40548 /* Otherwise we have to go through the motions and see if we can
40549 figure out how to generate the requested permutation. */
40550 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40551 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40552 if (!d.one_operand_p)
40553 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
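/* The raw registers are only placeholders and the generated insns are
   discarded; all that matters is whether expansion of this mask would
   succeed.  */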
40555 start_sequence ();
40556 ret = ix86_expand_vec_perm_const_1 (&d);
40557 end_sequence ();
40559 return ret;
40562 void
40563 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40565 struct expand_vec_perm_d d;
40566 unsigned i, nelt;
40568 d.target = targ;
40569 d.op0 = op0;
40570 d.op1 = op1;
40571 d.vmode = GET_MODE (targ);
40572 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40573 d.one_operand_p = false;
40574 d.testing_p = false;
40576 for (i = 0; i < nelt; ++i)
40577 d.perm[i] = i * 2 + odd;
40579 /* We'll either be able to implement the permutation directly... */
40580 if (expand_vec_perm_1 (&d))
40581 return;
40583 /* ... or we use the special-case patterns. */
40584 expand_vec_perm_even_odd_1 (&d, odd);
40587 static void
40588 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40590 struct expand_vec_perm_d d;
40591 unsigned i, nelt, base;
40592 bool ok;
40594 d.target = targ;
40595 d.op0 = op0;
40596 d.op1 = op1;
40597 d.vmode = GET_MODE (targ);
40598 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40599 d.one_operand_p = false;
40600 d.testing_p = false;
40602 base = high_p ? nelt / 2 : 0;
40603 for (i = 0; i < nelt / 2; ++i)
40605 d.perm[i * 2] = i + base;
40606 d.perm[i * 2 + 1] = i + base + nelt;
40609 /* Note that for AVX this isn't one instruction. */
40610 ok = ix86_expand_vec_perm_const_1 (&d);
40611 gcc_assert (ok);
40615 /* Expand a vector operation CODE for a V*QImode in terms of the
40616 same operation on V*HImode. */
40618 void
40619 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40621 enum machine_mode qimode = GET_MODE (dest);
40622 enum machine_mode himode;
40623 rtx (*gen_il) (rtx, rtx, rtx);
40624 rtx (*gen_ih) (rtx, rtx, rtx);
40625 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40626 struct expand_vec_perm_d d;
40627 bool ok, full_interleave;
40628 bool uns_p = false;
40629 int i;
40631 switch (qimode)
40633 case V16QImode:
40634 himode = V8HImode;
40635 gen_il = gen_vec_interleave_lowv16qi;
40636 gen_ih = gen_vec_interleave_highv16qi;
40637 break;
40638 case V32QImode:
40639 himode = V16HImode;
40640 gen_il = gen_avx2_interleave_lowv32qi;
40641 gen_ih = gen_avx2_interleave_highv32qi;
40642 break;
40643 default:
40644 gcc_unreachable ();
40647 op2_l = op2_h = op2;
40648 switch (code)
40650 case MULT:
40651 /* Unpack data such that we've got a source byte in each low byte of
40652 each word. We don't care what goes into the high byte of each word.
40653 Rather than trying to get zero in there, the most convenient thing
40654 is to let it be a copy of the low byte. */
40655 op2_l = gen_reg_rtx (qimode);
40656 op2_h = gen_reg_rtx (qimode);
40657 emit_insn (gen_il (op2_l, op2, op2));
40658 emit_insn (gen_ih (op2_h, op2, op2));
40659 /* FALLTHRU */
40661 op1_l = gen_reg_rtx (qimode);
40662 op1_h = gen_reg_rtx (qimode);
40663 emit_insn (gen_il (op1_l, op1, op1));
40664 emit_insn (gen_ih (op1_h, op1, op1));
40665 full_interleave = qimode == V16QImode;
40666 break;
40668 case ASHIFT:
40669 case LSHIFTRT:
40670 uns_p = true;
40671 /* FALLTHRU */
40672 case ASHIFTRT:
40673 op1_l = gen_reg_rtx (himode);
40674 op1_h = gen_reg_rtx (himode);
40675 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40676 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40677 full_interleave = true;
40678 break;
40679 default:
40680 gcc_unreachable ();
40683 /* Perform the operation. */
40684 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40685 1, OPTAB_DIRECT);
40686 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40687 1, OPTAB_DIRECT);
40688 gcc_assert (res_l && res_h);
40690 /* Merge the data back into the right place. */
40691 d.target = dest;
40692 d.op0 = gen_lowpart (qimode, res_l);
40693 d.op1 = gen_lowpart (qimode, res_h);
40694 d.vmode = qimode;
40695 d.nelt = GET_MODE_NUNITS (qimode);
40696 d.one_operand_p = false;
40697 d.testing_p = false;
40699 if (full_interleave)
40701 /* For SSE2, we used a full interleave, so the desired
40702 results are in the even elements. */
40703 for (i = 0; i < 32; ++i)
40704 d.perm[i] = i * 2;
40706 else
40708 /* For AVX, the interleave used above was not cross-lane. So the
40709 extraction is evens but with the second and third quarter swapped.
40710 Happily, that is even one insn shorter than even extraction. */
40711 for (i = 0; i < 32; ++i)
40712 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
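/* For illustration: indices with (i & 24) == 8 (the second quarter) read
   element i * 2 + 16 and those with (i & 24) == 16 (the third quarter)
   read element i * 2 - 16, so the extraction is the even elements with
   the second and third quarters exchanged, as described above.  */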
40715 ok = ix86_expand_vec_perm_const_1 (&d);
40716 gcc_assert (ok);
40718 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40719 gen_rtx_fmt_ee (code, qimode, op1, op2));
40722 void
40723 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40724 bool uns_p, bool odd_p)
40726 enum machine_mode mode = GET_MODE (op1);
40727 enum machine_mode wmode = GET_MODE (dest);
40728 rtx x;
40730 /* We only play even/odd games with vectors of SImode. */
40731 gcc_assert (mode == V4SImode || mode == V8SImode);
40733 /* If we're looking for the odd results, shift those members down to
40734 the even slots. For some cpus this is faster than a PSHUFD. */
40735 if (odd_p)
40737 if (TARGET_XOP && mode == V4SImode)
40739 x = force_reg (wmode, CONST0_RTX (wmode));
40740 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40741 return;
40744 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40745 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40746 x, NULL, 1, OPTAB_DIRECT);
40747 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40748 x, NULL, 1, OPTAB_DIRECT);
40749 op1 = gen_lowpart (mode, op1);
40750 op2 = gen_lowpart (mode, op2);
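/* After the logical right shift of the wide (DImode-element) view, each
   odd SImode element now occupies an even slot, so the even-element
   widening multiplies below compute the requested odd products.  */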
40753 if (mode == V8SImode)
40755 if (uns_p)
40756 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40757 else
40758 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40760 else if (uns_p)
40761 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40762 else if (TARGET_SSE4_1)
40763 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40764 else
40766 rtx s1, s2, t0, t1, t2;
40768 /* The easiest way to implement this without PMULDQ is to go through
40769 the motions as if we were performing a full 64-bit multiply, except
40770 that we need to do less shuffling of the elements. */
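/* Concretely, with HI() of a sign-extended 32-bit value being 0 or -1,
   the low 64 bits of the signed product are
   LO(A)*LO(B) + ((HI(A)*LO(B) + HI(B)*LO(A)) << 32),
   which is what the sequence below computes using unsigned multiplies.  */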
40772 /* Compute the sign-extension, aka highparts, of the two operands. */
40773 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40774 op1, pc_rtx, pc_rtx);
40775 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40776 op2, pc_rtx, pc_rtx);
40778 /* Multiply LO(A) * HI(B), and vice-versa. */
40779 t1 = gen_reg_rtx (wmode);
40780 t2 = gen_reg_rtx (wmode);
40781 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40782 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40784 /* Multiply LO(A) * LO(B). */
40785 t0 = gen_reg_rtx (wmode);
40786 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40788 /* Combine and shift the highparts into place. */
40789 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40790 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40791 1, OPTAB_DIRECT);
40793 /* Combine high and low parts. */
40794 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40795 return;
40797 emit_insn (x);
40800 void
40801 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40802 bool uns_p, bool high_p)
40804 enum machine_mode wmode = GET_MODE (dest);
40805 enum machine_mode mode = GET_MODE (op1);
40806 rtx t1, t2, t3, t4, mask;
40808 switch (mode)
40810 case V4SImode:
40811 t1 = gen_reg_rtx (mode);
40812 t2 = gen_reg_rtx (mode);
40813 if (TARGET_XOP && !uns_p)
40815 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40816 shuffle the elements once so that all elements are in the right
40817 place for immediate use: { A C B D }. */
40818 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40819 const1_rtx, GEN_INT (3)));
40820 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40821 const1_rtx, GEN_INT (3)));
40823 else
40825 /* Put the elements into place for the multiply. */
40826 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40827 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40828 high_p = false;
40830 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40831 break;
40833 case V8SImode:
40834 /* Shuffle the elements between the lanes. After this we
40835 have { A B E F | C D G H } for each operand. */
40836 t1 = gen_reg_rtx (V4DImode);
40837 t2 = gen_reg_rtx (V4DImode);
40838 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40839 const0_rtx, const2_rtx,
40840 const1_rtx, GEN_INT (3)));
40841 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40842 const0_rtx, const2_rtx,
40843 const1_rtx, GEN_INT (3)));
40845 /* Shuffle the elements within the lanes. After this we
40846 have { A A B B | C C D D } or { E E F F | G G H H }. */
40847 t3 = gen_reg_rtx (V8SImode);
40848 t4 = gen_reg_rtx (V8SImode);
40849 mask = GEN_INT (high_p
40850 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40851 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
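/* For illustration: the immediate evaluates to 0xfa (selector 2,2,3,3)
   when HIGH_P and to 0x50 (selector 0,0,1,1) otherwise, duplicating each
   selected element within every 128-bit lane.  */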
40852 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40853 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40855 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40856 break;
40858 case V8HImode:
40859 case V16HImode:
40860 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40861 uns_p, OPTAB_DIRECT);
40862 t2 = expand_binop (mode,
40863 uns_p ? umul_highpart_optab : smul_highpart_optab,
40864 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40865 gcc_assert (t1 && t2);
40867 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40868 break;
40870 case V16QImode:
40871 case V32QImode:
40872 t1 = gen_reg_rtx (wmode);
40873 t2 = gen_reg_rtx (wmode);
40874 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40875 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40877 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40878 break;
40880 default:
40881 gcc_unreachable ();
40885 void
40886 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40888 rtx res_1, res_2;
40890 res_1 = gen_reg_rtx (V4SImode);
40891 res_2 = gen_reg_rtx (V4SImode);
40892 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40893 op1, op2, true, false);
40894 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40895 op1, op2, true, true);
40897 /* Move the results in element 2 down to element 1; we don't care
40898 what goes in elements 2 and 3. Then we can merge the parts
40899 back together with an interleave.
40901 Note that two other sequences were tried:
40902 (1) Use interleaves at the start instead of psrldq, which allows
40903 us to use a single shufps to merge things back at the end.
40904 (2) Use shufps here to combine the two vectors, then pshufd to
40905 put the elements in the correct order.
40906 In both cases the cost of the reformatting stall was too high
40907 and the overall sequence slower. */
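/* For illustration: each pshufd below uses selector (0, 2, 0, 0), so the
   low 32 bits of the two DImode products land in SImode elements 0 and 1
   of res_1 (even products) and res_2 (odd products); the interleave then
   yields the four 32-bit products in their original element order.  */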
40909 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40910 const0_rtx, const0_rtx));
40911 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40912 const0_rtx, const0_rtx));
40913 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40915 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
40918 void
40919 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40921 enum machine_mode mode = GET_MODE (op0);
40922 rtx t1, t2, t3, t4, t5, t6;
40924 if (TARGET_XOP && mode == V2DImode)
40926 /* op1: A,B,C,D, op2: E,F,G,H */
40927 op1 = gen_lowpart (V4SImode, op1);
40928 op2 = gen_lowpart (V4SImode, op2);
40930 t1 = gen_reg_rtx (V4SImode);
40931 t2 = gen_reg_rtx (V4SImode);
40932 t3 = gen_reg_rtx (V2DImode);
40933 t4 = gen_reg_rtx (V2DImode);
40935 /* t1: B,A,D,C */
40936 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40937 GEN_INT (1),
40938 GEN_INT (0),
40939 GEN_INT (3),
40940 GEN_INT (2)));
40942 /* t2: (B*E),(A*F),(D*G),(C*H) */
40943 emit_insn (gen_mulv4si3 (t2, t1, op2));
40945 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40946 emit_insn (gen_xop_phadddq (t3, t2));
40948 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40949 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40951 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40952 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40954 else
40956 enum machine_mode nmode;
40957 rtx (*umul) (rtx, rtx, rtx);
40959 if (mode == V2DImode)
40961 umul = gen_vec_widen_umult_even_v4si;
40962 nmode = V4SImode;
40964 else if (mode == V4DImode)
40966 umul = gen_vec_widen_umult_even_v8si;
40967 nmode = V8SImode;
40969 else
40970 gcc_unreachable ();
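/* The scalar identity used below: with A = A_hi << 32 | A_lo and
   B = B_hi << 32 | B_lo, the low 64 bits of A*B are
   A_lo*B_lo + ((A_hi*B_lo + B_hi*A_lo) << 32).  */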
40973 /* Multiply low parts. */
40974 t1 = gen_reg_rtx (mode);
40975 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40977 /* Shift input vectors right 32 bits so we can multiply high parts. */
40978 t6 = GEN_INT (32);
40979 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40980 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40982 /* Multiply high parts by low parts. */
40983 t4 = gen_reg_rtx (mode);
40984 t5 = gen_reg_rtx (mode);
40985 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40986 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40988 /* Combine and shift the highparts back. */
40989 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40990 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40992 /* Combine high and low parts. */
40993 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
40996 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40997 gen_rtx_MULT (mode, op1, op2));
41000 /* Expand an insert into a vector register through pinsr insn.
41001 Return true if successful. */
41003 bool
41004 ix86_expand_pinsr (rtx *operands)
41006 rtx dst = operands[0];
41007 rtx src = operands[3];
41009 unsigned int size = INTVAL (operands[1]);
41010 unsigned int pos = INTVAL (operands[2]);
41012 if (GET_CODE (dst) == SUBREG)
41014 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41015 dst = SUBREG_REG (dst);
41018 if (GET_CODE (src) == SUBREG)
41019 src = SUBREG_REG (src);
41021 switch (GET_MODE (dst))
41023 case V16QImode:
41024 case V8HImode:
41025 case V4SImode:
41026 case V2DImode:
41028 enum machine_mode srcmode, dstmode;
41029 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41031 srcmode = mode_for_size (size, MODE_INT, 0);
41033 switch (srcmode)
41035 case QImode:
41036 if (!TARGET_SSE4_1)
41037 return false;
41038 dstmode = V16QImode;
41039 pinsr = gen_sse4_1_pinsrb;
41040 break;
41042 case HImode:
41043 if (!TARGET_SSE2)
41044 return false;
41045 dstmode = V8HImode;
41046 pinsr = gen_sse2_pinsrw;
41047 break;
41049 case SImode:
41050 if (!TARGET_SSE4_1)
41051 return false;
41052 dstmode = V4SImode;
41053 pinsr = gen_sse4_1_pinsrd;
41054 break;
41056 case DImode:
41057 gcc_assert (TARGET_64BIT);
41058 if (!TARGET_SSE4_1)
41059 return false;
41060 dstmode = V2DImode;
41061 pinsr = gen_sse4_1_pinsrq;
41062 break;
41064 default:
41065 return false;
41068 dst = gen_lowpart (dstmode, dst);
41069 src = gen_lowpart (srcmode, src);
41071 pos /= size;
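/* POS is now the element index; the insn pattern receives the one-hot
   selector 1 << pos.  For illustration, inserting an HImode value at bit
   offset 32 gives pos == 32 / 16 == 2 and the selector GEN_INT (1 << 2).  */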
41073 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41074 return true;
41077 default:
41078 return false;
41082 /* This function returns the calling abi specific va_list type node.
41083 It returns the FNDECL specific va_list type. */
41085 static tree
41086 ix86_fn_abi_va_list (tree fndecl)
41088 if (!TARGET_64BIT)
41089 return va_list_type_node;
41090 gcc_assert (fndecl != NULL_TREE);
41092 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41093 return ms_va_list_type_node;
41094 else
41095 return sysv_va_list_type_node;
41098 /* Returns the canonical va_list type specified by TYPE. If there
41099 is no valid TYPE provided, it returns NULL_TREE. */
41101 static tree
41102 ix86_canonical_va_list_type (tree type)
41104 tree wtype, htype;
41106 /* Resolve references and pointers to va_list type. */
41107 if (TREE_CODE (type) == MEM_REF)
41108 type = TREE_TYPE (type);
41109 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41110 type = TREE_TYPE (type);
41111 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41112 type = TREE_TYPE (type);
41114 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41116 wtype = va_list_type_node;
41117 gcc_assert (wtype != NULL_TREE);
41118 htype = type;
41119 if (TREE_CODE (wtype) == ARRAY_TYPE)
41121 /* If va_list is an array type, the argument may have decayed
41122 to a pointer type, e.g. by being passed to another function.
41123 In that case, unwrap both types so that we can compare the
41124 underlying records. */
41125 if (TREE_CODE (htype) == ARRAY_TYPE
41126 || POINTER_TYPE_P (htype))
41128 wtype = TREE_TYPE (wtype);
41129 htype = TREE_TYPE (htype);
41132 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41133 return va_list_type_node;
41134 wtype = sysv_va_list_type_node;
41135 gcc_assert (wtype != NULL_TREE);
41136 htype = type;
41137 if (TREE_CODE (wtype) == ARRAY_TYPE)
41139 /* If va_list is an array type, the argument may have decayed
41140 to a pointer type, e.g. by being passed to another function.
41141 In that case, unwrap both types so that we can compare the
41142 underlying records. */
41143 if (TREE_CODE (htype) == ARRAY_TYPE
41144 || POINTER_TYPE_P (htype))
41146 wtype = TREE_TYPE (wtype);
41147 htype = TREE_TYPE (htype);
41150 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41151 return sysv_va_list_type_node;
41152 wtype = ms_va_list_type_node;
41153 gcc_assert (wtype != NULL_TREE);
41154 htype = type;
41155 if (TREE_CODE (wtype) == ARRAY_TYPE)
41157 /* If va_list is an array type, the argument may have decayed
41158 to a pointer type, e.g. by being passed to another function.
41159 In that case, unwrap both types so that we can compare the
41160 underlying records. */
41161 if (TREE_CODE (htype) == ARRAY_TYPE
41162 || POINTER_TYPE_P (htype))
41164 wtype = TREE_TYPE (wtype);
41165 htype = TREE_TYPE (htype);
41168 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41169 return ms_va_list_type_node;
41170 return NULL_TREE;
41172 return std_canonical_va_list_type (type);
41175 /* Iterate through the target-specific builtin types for va_list.
41176 IDX denotes the iterator, *PTREE is set to the result type of
41177 the va_list builtin, and *PNAME to its internal type.
41178 Returns zero if there is no element for this index, otherwise
41179 IDX should be increased upon the next call.
41180 Note, do not iterate a base builtin's name like __builtin_va_list.
41181 Used from c_common_nodes_and_builtins. */
41183 static int
41184 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41186 if (TARGET_64BIT)
41188 switch (idx)
41190 default:
41191 break;
41193 case 0:
41194 *ptree = ms_va_list_type_node;
41195 *pname = "__builtin_ms_va_list";
41196 return 1;
41198 case 1:
41199 *ptree = sysv_va_list_type_node;
41200 *pname = "__builtin_sysv_va_list";
41201 return 1;
41205 return 0;
41208 #undef TARGET_SCHED_DISPATCH
41209 #define TARGET_SCHED_DISPATCH has_dispatch
41210 #undef TARGET_SCHED_DISPATCH_DO
41211 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41212 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41213 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41214 #undef TARGET_SCHED_REORDER
41215 #define TARGET_SCHED_REORDER ix86_sched_reorder
41216 #undef TARGET_SCHED_ADJUST_PRIORITY
41217 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41218 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41219 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41221 /* The size of the dispatch window is the total number of bytes of
41222 object code allowed in a window. */
41223 #define DISPATCH_WINDOW_SIZE 16
41225 /* Number of dispatch windows considered for scheduling. */
41226 #define MAX_DISPATCH_WINDOWS 3
41228 /* Maximum number of instructions in a window. */
41229 #define MAX_INSN 4
41231 /* Maximum number of immediate operands in a window. */
41232 #define MAX_IMM 4
41234 /* Maximum number of immediate bits allowed in a window. */
41235 #define MAX_IMM_SIZE 128
41237 /* Maximum number of 32 bit immediates allowed in a window. */
41238 #define MAX_IMM_32 4
41240 /* Maximum number of 64 bit immediates allowed in a window. */
41241 #define MAX_IMM_64 2
41243 /* Maximum total of loads or prefetches allowed in a window. */
41244 #define MAX_LOAD 2
41246 /* Maximum total of stores allowed in a window. */
41247 #define MAX_STORE 1
41249 #undef BIG
41250 #define BIG 100
41253 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41254 enum dispatch_group {
41255 disp_no_group = 0,
41256 disp_load,
41257 disp_store,
41258 disp_load_store,
41259 disp_prefetch,
41260 disp_imm,
41261 disp_imm_32,
41262 disp_imm_64,
41263 disp_branch,
41264 disp_cmp,
41265 disp_jcc,
41266 disp_last
41269 /* Number of allowable groups in a dispatch window. It is an array
41270 indexed by dispatch_group enum. 100 is used as a big number,
41271 because the number of these kinds of operations does not have any
41272 effect on the dispatch window, but we need them for other reasons in
41273 the table. */
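/* In enum order the limits are: no_group 0, load 2, store 1,
   load_store 1, prefetch 2, imm 4, imm_32 4, imm_64 2, branch 1,
   and BIG (effectively unlimited) for cmp and jcc.  */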
41274 static unsigned int num_allowable_groups[disp_last] = {
41275 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41278 char group_name[disp_last + 1][16] = {
41279 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41280 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41281 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41284 /* Instruction path. */
41285 enum insn_path {
41286 no_path = 0,
41287 path_single, /* Single micro op. */
41288 path_double, /* Double micro op. */
41289 path_multi, /* Instructions with more than 2 micro ops. */
41290 last_path
41293 /* sched_insn_info defines a window to the instructions scheduled in
41294 the basic block. It contains a pointer to the insn_info table and
41295 the instruction scheduled.
41297 Windows are allocated for each basic block and are linked
41298 together. */
41299 typedef struct sched_insn_info_s {
41300 rtx insn;
41301 enum dispatch_group group;
41302 enum insn_path path;
41303 int byte_len;
41304 int imm_bytes;
41305 } sched_insn_info;
41307 /* Linked list of dispatch windows. This is a two way list of
41308 dispatch windows of a basic block. It contains information about
41309 the number of uops in the window and the total number of
41310 instructions and of bytes in the object code for this dispatch
41311 window. */
41312 typedef struct dispatch_windows_s {
41313 int num_insn; /* Number of insn in the window. */
41314 int num_uops; /* Number of uops in the window. */
41315 int window_size; /* Number of bytes in the window. */
41316 int window_num; /* Window number, either 0 or 1. */
41317 int num_imm; /* Number of immediates in an insn. */
41318 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41319 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41320 int imm_size; /* Total immediates in the window. */
41321 int num_loads; /* Total memory loads in the window. */
41322 int num_stores; /* Total memory stores in the window. */
41323 int violation; /* Violation exists in window. */
41324 sched_insn_info *window; /* Pointer to the window. */
41325 struct dispatch_windows_s *next;
41326 struct dispatch_windows_s *prev;
41327 } dispatch_windows;
41329 /* Immediate values used in an insn. */
41330 typedef struct imm_info_s
41332 int imm;
41333 int imm32;
41334 int imm64;
41335 } imm_info;
41337 static dispatch_windows *dispatch_window_list;
41338 static dispatch_windows *dispatch_window_list1;
41340 /* Get dispatch group of insn. */
41342 static enum dispatch_group
41343 get_mem_group (rtx insn)
41345 enum attr_memory memory;
41347 if (INSN_CODE (insn) < 0)
41348 return disp_no_group;
41349 memory = get_attr_memory (insn);
41350 if (memory == MEMORY_STORE)
41351 return disp_store;
41353 if (memory == MEMORY_LOAD)
41354 return disp_load;
41356 if (memory == MEMORY_BOTH)
41357 return disp_load_store;
41359 return disp_no_group;
41362 /* Return true if insn is a compare instruction. */
41364 static bool
41365 is_cmp (rtx insn)
41367 enum attr_type type;
41369 type = get_attr_type (insn);
41370 return (type == TYPE_TEST
41371 || type == TYPE_ICMP
41372 || type == TYPE_FCMP
41373 || GET_CODE (PATTERN (insn)) == COMPARE);
41376 /* Return true if a dispatch violation was encountered. */
41378 static bool
41379 dispatch_violation (void)
41381 if (dispatch_window_list->next)
41382 return dispatch_window_list->next->violation;
41383 return dispatch_window_list->violation;
41386 /* Return true if insn is a branch instruction. */
41388 static bool
41389 is_branch (rtx insn)
41391 return (CALL_P (insn) || JUMP_P (insn));
41394 /* Return true if insn is a prefetch instruction. */
41396 static bool
41397 is_prefetch (rtx insn)
41399 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41402 /* This function initializes a dispatch window and the list container holding a
41403 pointer to the window. */
41405 static void
41406 init_window (int window_num)
41408 int i;
41409 dispatch_windows *new_list;
41411 if (window_num == 0)
41412 new_list = dispatch_window_list;
41413 else
41414 new_list = dispatch_window_list1;
41416 new_list->num_insn = 0;
41417 new_list->num_uops = 0;
41418 new_list->window_size = 0;
41419 new_list->next = NULL;
41420 new_list->prev = NULL;
41421 new_list->window_num = window_num;
41422 new_list->num_imm = 0;
41423 new_list->num_imm_32 = 0;
41424 new_list->num_imm_64 = 0;
41425 new_list->imm_size = 0;
41426 new_list->num_loads = 0;
41427 new_list->num_stores = 0;
41428 new_list->violation = false;
41430 for (i = 0; i < MAX_INSN; i++)
41432 new_list->window[i].insn = NULL;
41433 new_list->window[i].group = disp_no_group;
41434 new_list->window[i].path = no_path;
41435 new_list->window[i].byte_len = 0;
41436 new_list->window[i].imm_bytes = 0;
41438 return;
41441 /* This function allocates and initializes a dispatch window and the
41442 list container holding a pointer to the window. */
41444 static dispatch_windows *
41445 allocate_window (void)
41447 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41448 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41450 return new_list;
41453 /* This routine initializes the dispatch scheduling information. It
41454 initiates building dispatch scheduler tables and constructs the
41455 first dispatch window. */
41457 static void
41458 init_dispatch_sched (void)
41460 /* Allocate a dispatch list and a window. */
41461 dispatch_window_list = allocate_window ();
41462 dispatch_window_list1 = allocate_window ();
41463 init_window (0);
41464 init_window (1);
41467 /* This function returns true if a branch is detected. End of a basic block
41468 does not have to be a branch, but here we assume only branches end a
41469 window. */
41471 static bool
41472 is_end_basic_block (enum dispatch_group group)
41474 return group == disp_branch;
41477 /* This function is called when the end of a window's processing is reached. */
41479 static void
41480 process_end_window (void)
41482 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41483 if (dispatch_window_list->next)
41485 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41486 gcc_assert (dispatch_window_list->window_size
41487 + dispatch_window_list1->window_size <= 48);
41488 init_window (1);
41490 init_window (0);
41493 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41494 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41495 for 48 bytes of instructions. Note that these windows are not dispatch
41496 windows whose sizes are DISPATCH_WINDOW_SIZE. */
41498 static dispatch_windows *
41499 allocate_next_window (int window_num)
41501 if (window_num == 0)
41503 if (dispatch_window_list->next)
41504 init_window (1);
41505 init_window (0);
41506 return dispatch_window_list;
41509 dispatch_window_list->next = dispatch_window_list1;
41510 dispatch_window_list1->prev = dispatch_window_list;
41512 return dispatch_window_list1;
41515 /* Increment the number of immediate operands of an instruction. */
41517 static int
41518 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41520 if (*in_rtx == 0)
41521 return 0;
41523 switch (GET_CODE (*in_rtx))
41525 case CONST:
41526 case SYMBOL_REF:
41527 case CONST_INT:
41528 (imm_values->imm)++;
41529 if (x86_64_immediate_operand (*in_rtx, SImode))
41530 (imm_values->imm32)++;
41531 else
41532 (imm_values->imm64)++;
41533 break;
41535 case CONST_DOUBLE:
41536 (imm_values->imm)++;
41537 (imm_values->imm64)++;
41538 break;
41540 case CODE_LABEL:
41541 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41543 (imm_values->imm)++;
41544 (imm_values->imm32)++;
41546 break;
41548 default:
41549 break;
41552 return 0;
41555 /* Compute number of immediate operands of an instruction. */
41557 static void
41558 find_constant (rtx in_rtx, imm_info *imm_values)
41560 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41561 (rtx_function) find_constant_1, (void *) imm_values);
41564 /* Return total size of immediate operands of an instruction along with number
41565 of corresponding immediate-operands. It initializes its parameters to zero
41566 before calling FIND_CONSTANT.
41567 INSN is the input instruction. IMM is the total of immediates.
41568 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41569 bit immediates. */
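/* For illustration: an insn carrying one SImode-representable immediate
   and one 64-bit immediate sets *IMM to 2, *IMM32 to 1, *IMM64 to 1 and
   returns 1*4 + 1*8 == 12 bytes.  */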
41571 static int
41572 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41574 imm_info imm_values = {0, 0, 0};
41576 find_constant (insn, &imm_values);
41577 *imm = imm_values.imm;
41578 *imm32 = imm_values.imm32;
41579 *imm64 = imm_values.imm64;
41580 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
41583 /* This function indicates whether an instruction has an immediate
41584 operand. */
41586 static bool
41587 has_immediate (rtx insn)
41589 int num_imm_operand;
41590 int num_imm32_operand;
41591 int num_imm64_operand;
41593 if (insn)
41594 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41595 &num_imm64_operand);
41596 return false;
41599 /* Return single or double path for instructions. */
41601 static enum insn_path
41602 get_insn_path (rtx insn)
41604 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41606 if ((int)path == 0)
41607 return path_single;
41609 if ((int)path == 1)
41610 return path_double;
41612 return path_multi;
41615 /* Return insn dispatch group. */
41617 static enum dispatch_group
41618 get_insn_group (rtx insn)
41620 enum dispatch_group group = get_mem_group (insn);
41621 if (group)
41622 return group;
41624 if (is_branch (insn))
41625 return disp_branch;
41627 if (is_cmp (insn))
41628 return disp_cmp;
41630 if (has_immediate (insn))
41631 return disp_imm;
41633 if (is_prefetch (insn))
41634 return disp_prefetch;
41636 return disp_no_group;
41639 /* Count number of GROUP restricted instructions in a dispatch
41640 window WINDOW_LIST. */
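/* The function returns BIG when INSN would overflow a per-window limit
   for its group and 1 otherwise; the caller compares the result against
   num_allowable_groups[group].  */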
41642 static int
41643 count_num_restricted (rtx insn, dispatch_windows *window_list)
41645 enum dispatch_group group = get_insn_group (insn);
41646 int imm_size;
41647 int num_imm_operand;
41648 int num_imm32_operand;
41649 int num_imm64_operand;
41651 if (group == disp_no_group)
41652 return 0;
41654 if (group == disp_imm)
41656 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41657 &num_imm64_operand);
41658 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41659 || num_imm_operand + window_list->num_imm > MAX_IMM
41660 || (num_imm32_operand > 0
41661 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41662 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41663 || (num_imm64_operand > 0
41664 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41665 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41666 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41667 && num_imm64_operand > 0
41668 && ((window_list->num_imm_64 > 0
41669 && window_list->num_insn >= 2)
41670 || window_list->num_insn >= 3)))
41671 return BIG;
41673 return 1;
41676 if ((group == disp_load_store
41677 && (window_list->num_loads >= MAX_LOAD
41678 || window_list->num_stores >= MAX_STORE))
41679 || ((group == disp_load
41680 || group == disp_prefetch)
41681 && window_list->num_loads >= MAX_LOAD)
41682 || (group == disp_store
41683 && window_list->num_stores >= MAX_STORE))
41684 return BIG;
41686 return 1;
41689 /* This function returns true if insn satisfies dispatch rules on the
41690 last window scheduled. */
41692 static bool
41693 fits_dispatch_window (rtx insn)
41695 dispatch_windows *window_list = dispatch_window_list;
41696 dispatch_windows *window_list_next = dispatch_window_list->next;
41697 unsigned int num_restrict;
41698 enum dispatch_group group = get_insn_group (insn);
41699 enum insn_path path = get_insn_path (insn);
41700 int sum;
41702 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41703 instructions should be given the lowest priority in the
41704 scheduling process in the Haifa scheduler to make sure they will be
41705 scheduled in the same dispatch window as the reference to them. */
41706 if (group == disp_jcc || group == disp_cmp)
41707 return false;
41709 /* Check nonrestricted. */
41710 if (group == disp_no_group || group == disp_branch)
41711 return true;
41713 /* Get last dispatch window. */
41714 if (window_list_next)
41715 window_list = window_list_next;
41717 if (window_list->window_num == 1)
41719 sum = window_list->prev->window_size + window_list->window_size;
41721 if (sum == 32
41722 || (min_insn_size (insn) + sum) >= 48)
41723 /* Window 1 is full. Go for next window. */
41724 return true;
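/* A pair of windows together covers at most 48 bytes of code (see
   process_end_window), so window 1 is treated as full once the pair
   already spans 32 bytes or adding INSN would reach 48.  */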
41727 num_restrict = count_num_restricted (insn, window_list);
41729 if (num_restrict > num_allowable_groups[group])
41730 return false;
41732 /* See if it fits in the first window. */
41733 if (window_list->window_num == 0)
41735 /* The first window should have only single and double path
41736 uops. */
41737 if (path == path_double
41738 && (window_list->num_uops + 2) > MAX_INSN)
41739 return false;
41740 else if (path != path_single)
41741 return false;
41743 return true;
41746 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41747 dispatch window WINDOW_LIST. */
41749 static void
41750 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41752 int byte_len = min_insn_size (insn);
41753 int num_insn = window_list->num_insn;
41754 int imm_size;
41755 sched_insn_info *window = window_list->window;
41756 enum dispatch_group group = get_insn_group (insn);
41757 enum insn_path path = get_insn_path (insn);
41758 int num_imm_operand;
41759 int num_imm32_operand;
41760 int num_imm64_operand;
41762 if (!window_list->violation && group != disp_cmp
41763 && !fits_dispatch_window (insn))
41764 window_list->violation = true;
41766 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41767 &num_imm64_operand);
41769 /* Initialize window with new instruction. */
41770 window[num_insn].insn = insn;
41771 window[num_insn].byte_len = byte_len;
41772 window[num_insn].group = group;
41773 window[num_insn].path = path;
41774 window[num_insn].imm_bytes = imm_size;
41776 window_list->window_size += byte_len;
41777 window_list->num_insn = num_insn + 1;
41778 window_list->num_uops = window_list->num_uops + num_uops;
41779 window_list->imm_size += imm_size;
41780 window_list->num_imm += num_imm_operand;
41781 window_list->num_imm_32 += num_imm32_operand;
41782 window_list->num_imm_64 += num_imm64_operand;
41784 if (group == disp_store)
41785 window_list->num_stores += 1;
41786 else if (group == disp_load
41787 || group == disp_prefetch)
41788 window_list->num_loads += 1;
41789 else if (group == disp_load_store)
41791 window_list->num_stores += 1;
41792 window_list->num_loads += 1;
41796 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41797 If the total bytes of instructions or the number of instructions in
41798 the window exceed the allowable limits, it allocates a new window. */
41800 static void
41801 add_to_dispatch_window (rtx insn)
41803 int byte_len;
41804 dispatch_windows *window_list;
41805 dispatch_windows *next_list;
41806 dispatch_windows *window0_list;
41807 enum insn_path path;
41808 enum dispatch_group insn_group;
41809 bool insn_fits;
41810 int num_insn;
41811 int num_uops;
41812 int window_num;
41813 int insn_num_uops;
41814 int sum;
41816 if (INSN_CODE (insn) < 0)
41817 return;
41819 byte_len = min_insn_size (insn);
41820 window_list = dispatch_window_list;
41821 next_list = window_list->next;
41822 path = get_insn_path (insn);
41823 insn_group = get_insn_group (insn);
41825 /* Get the last dispatch window. */
41826 if (next_list)
41827 window_list = dispatch_window_list->next;
41829 if (path == path_single)
41830 insn_num_uops = 1;
41831 else if (path == path_double)
41832 insn_num_uops = 2;
41833 else
41834 insn_num_uops = (int) path;
41836 /* If the current window is full, get a new window.
41837 Window number zero is full if MAX_INSN uops are scheduled in it.
41838 Window number one is full if window zero's bytes plus window
41839 one's bytes equal 32, or if adding the bytes of the new instruction
41840 to the total makes it greater than 48, or if it already has MAX_INSN
41841 instructions in it. */
41842 num_insn = window_list->num_insn;
41843 num_uops = window_list->num_uops;
41844 window_num = window_list->window_num;
41845 insn_fits = fits_dispatch_window (insn);
41847 if (num_insn >= MAX_INSN
41848 || num_uops + insn_num_uops > MAX_INSN
41849 || !(insn_fits))
41851 window_num = ~window_num & 1;
41852 window_list = allocate_next_window (window_num);
41855 if (window_num == 0)
41857 add_insn_window (insn, window_list, insn_num_uops);
41858 if (window_list->num_insn >= MAX_INSN
41859 && insn_group == disp_branch)
41861 process_end_window ();
41862 return;
41865 else if (window_num == 1)
41867 window0_list = window_list->prev;
41868 sum = window0_list->window_size + window_list->window_size;
41869 if (sum == 32
41870 || (byte_len + sum) >= 48)
41872 process_end_window ();
41873 window_list = dispatch_window_list;
41876 add_insn_window (insn, window_list, insn_num_uops);
41878 else
41879 gcc_unreachable ();
41881 if (is_end_basic_block (insn_group))
41883 /* End of basic block was reached; do end-of-basic-block processing. */
41884 process_end_window ();
41885 return;
41889 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41891 DEBUG_FUNCTION static void
41892 debug_dispatch_window_file (FILE *file, int window_num)
41894 dispatch_windows *list;
41895 int i;
41897 if (window_num == 0)
41898 list = dispatch_window_list;
41899 else
41900 list = dispatch_window_list1;
41902 fprintf (file, "Window #%d:\n", list->window_num);
41903 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41904 list->num_insn, list->num_uops, list->window_size);
41905 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41906 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41908 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41909 list->num_stores);
41910 fprintf (file, " insn info:\n");
41912 for (i = 0; i < MAX_INSN; i++)
41914 if (!list->window[i].insn)
41915 break;
41916 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41917 i, group_name[list->window[i].group],
41918 i, (void *)list->window[i].insn,
41919 i, list->window[i].path,
41920 i, list->window[i].byte_len,
41921 i, list->window[i].imm_bytes);
41925 /* Print to stdout a dispatch window. */
41927 DEBUG_FUNCTION void
41928 debug_dispatch_window (int window_num)
41930 debug_dispatch_window_file (stdout, window_num);
41933 /* Print INSN dispatch information to FILE. */
41935 DEBUG_FUNCTION static void
41936 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41938 int byte_len;
41939 enum insn_path path;
41940 enum dispatch_group group;
41941 int imm_size;
41942 int num_imm_operand;
41943 int num_imm32_operand;
41944 int num_imm64_operand;
41946 if (INSN_CODE (insn) < 0)
41947 return;
41949 byte_len = min_insn_size (insn);
41950 path = get_insn_path (insn);
41951 group = get_insn_group (insn);
41952 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41953 &num_imm64_operand);
41955 fprintf (file, " insn info:\n");
41956 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41957 group_name[group], path, byte_len);
41958 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41959 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41962 /* Print to STDERR the status of the ready list with respect to
41963 dispatch windows. */
41965 DEBUG_FUNCTION void
41966 debug_ready_dispatch (void)
41968 int i;
41969 int no_ready = number_in_ready ();
41971 fprintf (stdout, "Number of ready: %d\n", no_ready);
41973 for (i = 0; i < no_ready; i++)
41974 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41977 /* This routine is the driver of the dispatch scheduler. */
41979 static void
41980 do_dispatch (rtx insn, int mode)
41982 if (mode == DISPATCH_INIT)
41983 init_dispatch_sched ();
41984 else if (mode == ADD_TO_DISPATCH_WINDOW)
41985 add_to_dispatch_window (insn);
41988 /* Return TRUE if Dispatch Scheduling is supported. */
41990 static bool
41991 has_dispatch (rtx insn, int action)
41993 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
41994 && flag_dispatch_scheduler)
41995 switch (action)
41997 default:
41998 return false;
42000 case IS_DISPATCH_ON:
42001 return true;
42002 break;
42004 case IS_CMP:
42005 return is_cmp (insn);
42007 case DISPATCH_VIOLATION:
42008 return dispatch_violation ();
42010 case FITS_DISPATCH_WINDOW:
42011 return fits_dispatch_window (insn);
42014 return false;
42017 /* Implementation of reassociation_width target hook used by
42018 reassoc phase to identify parallelism level in reassociated
42019 tree. The statement's tree_code is passed in OPC. The argument's
42020 type is passed in MODE.
42022 Currently parallel reassociation is enabled for Atom
42023 processors only and we set reassociation width to be 2
42024 because Atom may issue up to 2 instructions per cycle.
42026 Return value should be fixed if parallel reassociation is
42027 enabled for other processors. */
42029 static int
42030 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42031 enum machine_mode mode)
42033 int res = 1;
42035 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42036 res = 2;
42037 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42038 res = 2;
42040 return res;
42043 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42044 place emms and femms instructions. */
42046 static enum machine_mode
42047 ix86_preferred_simd_mode (enum machine_mode mode)
42049 if (!TARGET_SSE)
42050 return word_mode;
42052 switch (mode)
42054 case QImode:
42055 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42056 case HImode:
42057 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42058 case SImode:
42059 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42060 case DImode:
42061 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42063 case SFmode:
42064 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42065 return V8SFmode;
42066 else
42067 return V4SFmode;
42069 case DFmode:
42070 if (!TARGET_VECTORIZE_DOUBLE)
42071 return word_mode;
42072 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42073 return V4DFmode;
42074 else if (TARGET_SSE2)
42075 return V2DFmode;
42076 /* FALLTHRU */
42078 default:
42079 return word_mode;
42083 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
42084 vectors. */
42086 static unsigned int
42087 ix86_autovectorize_vector_sizes (void)
42089 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42094 /* Return the class of registers which could be used for a pseudo of MODE
42095 and of class RCLASS for spilling instead of memory. Return NO_REGS
42096 if it is not possible or not profitable. */
42097 static reg_class_t
42098 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42100 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42101 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42102 && INTEGER_CLASS_P (rclass))
42103 return SSE_REGS;
42104 return NO_REGS;
42107 /* Implement targetm.vectorize.init_cost. */
42109 static void *
42110 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42112 unsigned *cost = XNEWVEC (unsigned, 3);
42113 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42114 return cost;
42117 /* Implement targetm.vectorize.add_stmt_cost. */
42119 static unsigned
42120 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42121 struct _stmt_vec_info *stmt_info, int misalign,
42122 enum vect_cost_model_location where)
42124 unsigned *cost = (unsigned *) data;
42125 unsigned retval = 0;
42127 if (flag_vect_cost_model)
42129 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42130 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42132 /* Statements in an inner loop relative to the loop being
42133 vectorized are weighted more heavily. The value here is
42134 arbitrary and could potentially be improved with analysis. */
42135 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42136 count *= 50; /* FIXME. */
42138 retval = (unsigned) (count * stmt_cost);
42139 cost[where] += retval;
42142 return retval;
42145 /* Implement targetm.vectorize.finish_cost. */
42147 static void
42148 ix86_finish_cost (void *data, unsigned *prologue_cost,
42149 unsigned *body_cost, unsigned *epilogue_cost)
42151 unsigned *cost = (unsigned *) data;
42152 *prologue_cost = cost[vect_prologue];
42153 *body_cost = cost[vect_body];
42154 *epilogue_cost = cost[vect_epilogue];
42157 /* Implement targetm.vectorize.destroy_cost_data. */
42159 static void
42160 ix86_destroy_cost_data (void *data)
42162 free (data);
42165 /* Validate target specific memory model bits in VAL. */
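/* VAL combines a C11/C++11 memory model in MEMMODEL_MASK with the
   target-specific IX86_HLE_ACQUIRE and IX86_HLE_RELEASE bits; the checks
   below require each HLE bit to pair with an acquire/release (or
   stronger) model and fall back to MEMMODEL_SEQ_CST otherwise.  */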
42167 static unsigned HOST_WIDE_INT
42168 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42170 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42171 bool strong;
42173 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42174 |MEMMODEL_MASK)
42175 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42177 warning (OPT_Winvalid_memory_model,
42178 "Unknown architecture specific memory model");
42179 return MEMMODEL_SEQ_CST;
42181 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42182 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42184 warning (OPT_Winvalid_memory_model,
42185 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42186 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42188 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42190 warning (OPT_Winvalid_memory_model,
42191 "HLE_RELEASE not used with RELEASE or stronger memory model");
42192 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42194 return val;
42197 /* Initialize the GCC target structure. */
42198 #undef TARGET_RETURN_IN_MEMORY
42199 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42201 #undef TARGET_LEGITIMIZE_ADDRESS
42202 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42204 #undef TARGET_ATTRIBUTE_TABLE
42205 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42206 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42207 # undef TARGET_MERGE_DECL_ATTRIBUTES
42208 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42209 #endif
42211 #undef TARGET_COMP_TYPE_ATTRIBUTES
42212 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42214 #undef TARGET_INIT_BUILTINS
42215 #define TARGET_INIT_BUILTINS ix86_init_builtins
42216 #undef TARGET_BUILTIN_DECL
42217 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42218 #undef TARGET_EXPAND_BUILTIN
42219 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42221 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42222 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42223 ix86_builtin_vectorized_function
42225 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42226 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42228 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42229 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42231 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42232 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42234 #undef TARGET_BUILTIN_RECIPROCAL
42235 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42237 #undef TARGET_ASM_FUNCTION_EPILOGUE
42238 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42240 #undef TARGET_ENCODE_SECTION_INFO
42241 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42242 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42243 #else
42244 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42245 #endif
42247 #undef TARGET_ASM_OPEN_PAREN
42248 #define TARGET_ASM_OPEN_PAREN ""
42249 #undef TARGET_ASM_CLOSE_PAREN
42250 #define TARGET_ASM_CLOSE_PAREN ""
42252 #undef TARGET_ASM_BYTE_OP
42253 #define TARGET_ASM_BYTE_OP ASM_BYTE
42255 #undef TARGET_ASM_ALIGNED_HI_OP
42256 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42257 #undef TARGET_ASM_ALIGNED_SI_OP
42258 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42259 #ifdef ASM_QUAD
42260 #undef TARGET_ASM_ALIGNED_DI_OP
42261 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42262 #endif
42264 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42265 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42267 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42268 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42270 #undef TARGET_ASM_UNALIGNED_HI_OP
42271 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42272 #undef TARGET_ASM_UNALIGNED_SI_OP
42273 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42274 #undef TARGET_ASM_UNALIGNED_DI_OP
42275 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42277 #undef TARGET_PRINT_OPERAND
42278 #define TARGET_PRINT_OPERAND ix86_print_operand
42279 #undef TARGET_PRINT_OPERAND_ADDRESS
42280 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42281 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42282 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42283 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42284 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42286 #undef TARGET_SCHED_INIT_GLOBAL
42287 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42288 #undef TARGET_SCHED_ADJUST_COST
42289 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42290 #undef TARGET_SCHED_ISSUE_RATE
42291 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42292 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42293 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42294 ia32_multipass_dfa_lookahead
42296 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42297 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42299 #undef TARGET_MEMMODEL_CHECK
42300 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42302 #ifdef HAVE_AS_TLS
42303 #undef TARGET_HAVE_TLS
42304 #define TARGET_HAVE_TLS true
42305 #endif
42306 #undef TARGET_CANNOT_FORCE_CONST_MEM
42307 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42308 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42309 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42311 #undef TARGET_DELEGITIMIZE_ADDRESS
42312 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42314 #undef TARGET_MS_BITFIELD_LAYOUT_P
42315 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42317 #if TARGET_MACHO
42318 #undef TARGET_BINDS_LOCAL_P
42319 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42320 #endif
42321 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42322 #undef TARGET_BINDS_LOCAL_P
42323 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42324 #endif
42326 #undef TARGET_ASM_OUTPUT_MI_THUNK
42327 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42328 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42329 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42331 #undef TARGET_ASM_FILE_START
42332 #define TARGET_ASM_FILE_START x86_file_start
42334 #undef TARGET_OPTION_OVERRIDE
42335 #define TARGET_OPTION_OVERRIDE ix86_option_override
42337 #undef TARGET_REGISTER_MOVE_COST
42338 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42339 #undef TARGET_MEMORY_MOVE_COST
42340 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42341 #undef TARGET_RTX_COSTS
42342 #define TARGET_RTX_COSTS ix86_rtx_costs
42343 #undef TARGET_ADDRESS_COST
42344 #define TARGET_ADDRESS_COST ix86_address_cost
42346 #undef TARGET_FIXED_CONDITION_CODE_REGS
42347 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42348 #undef TARGET_CC_MODES_COMPATIBLE
42349 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42351 #undef TARGET_MACHINE_DEPENDENT_REORG
42352 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42354 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42355 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42357 #undef TARGET_BUILD_BUILTIN_VA_LIST
42358 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42360 #undef TARGET_FOLD_BUILTIN
42361 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42363 #undef TARGET_COMPARE_VERSION_PRIORITY
42364 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42366 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42367 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42368 ix86_generate_version_dispatcher_body
42370 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42371 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42372 ix86_get_function_versions_dispatcher
42374 #undef TARGET_ENUM_VA_LIST_P
42375 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42377 #undef TARGET_FN_ABI_VA_LIST
42378 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42380 #undef TARGET_CANONICAL_VA_LIST_TYPE
42381 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42383 #undef TARGET_EXPAND_BUILTIN_VA_START
42384 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42386 #undef TARGET_MD_ASM_CLOBBERS
42387 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42389 #undef TARGET_PROMOTE_PROTOTYPES
42390 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42391 #undef TARGET_STRUCT_VALUE_RTX
42392 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42393 #undef TARGET_SETUP_INCOMING_VARARGS
42394 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42395 #undef TARGET_MUST_PASS_IN_STACK
42396 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42397 #undef TARGET_FUNCTION_ARG_ADVANCE
42398 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42399 #undef TARGET_FUNCTION_ARG
42400 #define TARGET_FUNCTION_ARG ix86_function_arg
42401 #undef TARGET_FUNCTION_ARG_BOUNDARY
42402 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42403 #undef TARGET_PASS_BY_REFERENCE
42404 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42405 #undef TARGET_INTERNAL_ARG_POINTER
42406 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42407 #undef TARGET_UPDATE_STACK_BOUNDARY
42408 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42409 #undef TARGET_GET_DRAP_RTX
42410 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42411 #undef TARGET_STRICT_ARGUMENT_NAMING
42412 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42413 #undef TARGET_STATIC_CHAIN
42414 #define TARGET_STATIC_CHAIN ix86_static_chain
42415 #undef TARGET_TRAMPOLINE_INIT
42416 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42417 #undef TARGET_RETURN_POPS_ARGS
42418 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42420 #undef TARGET_LEGITIMATE_COMBINED_INSN
42421 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42423 #undef TARGET_ASAN_SHADOW_OFFSET
42424 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42426 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42427 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42429 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42430 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42432 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42433 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42435 #undef TARGET_C_MODE_FOR_SUFFIX
42436 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42438 #ifdef HAVE_AS_TLS
42439 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42440 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42441 #endif
42443 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42444 #undef TARGET_INSERT_ATTRIBUTES
42445 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42446 #endif
42448 #undef TARGET_MANGLE_TYPE
42449 #define TARGET_MANGLE_TYPE ix86_mangle_type
42451 #if !TARGET_MACHO
42452 #undef TARGET_STACK_PROTECT_FAIL
42453 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42454 #endif
42456 #undef TARGET_FUNCTION_VALUE
42457 #define TARGET_FUNCTION_VALUE ix86_function_value
42459 #undef TARGET_FUNCTION_VALUE_REGNO_P
42460 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42462 #undef TARGET_PROMOTE_FUNCTION_MODE
42463 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42465 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42466 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42468 #undef TARGET_INSTANTIATE_DECLS
42469 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42471 #undef TARGET_SECONDARY_RELOAD
42472 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42474 #undef TARGET_CLASS_MAX_NREGS
42475 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42477 #undef TARGET_PREFERRED_RELOAD_CLASS
42478 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42479 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42480 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42481 #undef TARGET_CLASS_LIKELY_SPILLED_P
42482 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42484 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42485 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42486 ix86_builtin_vectorization_cost
42487 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42488 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42489 ix86_vectorize_vec_perm_const_ok
42490 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42491 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42492 ix86_preferred_simd_mode
42493 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42494 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42495 ix86_autovectorize_vector_sizes
42496 #undef TARGET_VECTORIZE_INIT_COST
42497 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42498 #undef TARGET_VECTORIZE_ADD_STMT_COST
42499 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42500 #undef TARGET_VECTORIZE_FINISH_COST
42501 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42502 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42503 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42505 #undef TARGET_SET_CURRENT_FUNCTION
42506 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42508 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42509 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42511 #undef TARGET_OPTION_SAVE
42512 #define TARGET_OPTION_SAVE ix86_function_specific_save
42514 #undef TARGET_OPTION_RESTORE
42515 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42517 #undef TARGET_OPTION_PRINT
42518 #define TARGET_OPTION_PRINT ix86_function_specific_print
42520 #undef TARGET_OPTION_FUNCTION_VERSIONS
42521 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42523 #undef TARGET_CAN_INLINE_P
42524 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42526 #undef TARGET_EXPAND_TO_RTL_HOOK
42527 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42529 #undef TARGET_LEGITIMATE_ADDRESS_P
42530 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42532 #undef TARGET_LRA_P
42533 #define TARGET_LRA_P hook_bool_void_true
42535 #undef TARGET_REGISTER_PRIORITY
42536 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42538 #undef TARGET_LEGITIMATE_CONSTANT_P
42539 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42541 #undef TARGET_FRAME_POINTER_REQUIRED
42542 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42544 #undef TARGET_CAN_ELIMINATE
42545 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42547 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42548 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42550 #undef TARGET_ASM_CODE_END
42551 #define TARGET_ASM_CODE_END ix86_code_end
42553 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42554 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42556 #if TARGET_MACHO
42557 #undef TARGET_INIT_LIBFUNCS
42558 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42559 #endif
42561 #undef TARGET_SPILL_CLASS
42562 #define TARGET_SPILL_CLASS ix86_spill_class
42564 struct gcc_target targetm = TARGET_INITIALIZER;
42566 #include "gt-i386.h"