1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
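/* Editor's note (illustrative addition, not in the original source):
   MODE_INDEX maps a scalar integer mode to a row of the five-entry
   per-mode arrays in the processor_costs tables below, e.g.
     MODE_INDEX (QImode) == 0,  MODE_INDEX (SImode) == 2,
   and any mode other than QI/HI/SI/DI falls into the trailing "other"
   slot (index 4).  A lookup therefore has the shape
     cost->mult_cost_array[MODE_INDEX (mode)]
   where mult_cost_array stands in for the real field names, which
   should be checked in i386.h.  */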
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
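/* Editor's note (worked example under the assumption stated above):
   if COSTS_N_INSNS (N) expands to (N) * 4 and a typical addition
   encodes in 2 bytes, then COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
   so in the size-tuned table one instruction unit of cost corresponds
   to two bytes of code and byte counts remain directly comparable with
   the instruction counts used by the speed-tuned tables.  */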
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
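/* Editor's note (assumption; verify against struct stringop_algs in
   i386.h): each cost table below carries two memcpy and two memset
   strategy descriptors, which I read as one for 32-bit and one for
   64-bit code.  DUMMY_STRINGOP_ALGS simply says "always use a library
   call" and fills the slot a processor never runs in, e.g. the second
   descriptor for 32-bit-only chips such as the i386 and i486.  */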
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
153 1, /* cond_not_taken_branch_cost. */
154 };
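/* Editor's note (context, hedged): as far as I can tell from the rest
   of this file, each -mtune target is bound to one of these tables via
   processor_target_table, and the active table is consulted through the
   ix86_cost pointer (e.g. ix86_cost->add in the RTX cost hooks);
   ix86_size_cost above is the variant chosen when optimizing for size
   rather than for a particular processor.  */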
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
224 1, /* cond_not_taken_branch_cost. */
225 };
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. 486 has 8kB cache
270 shared for code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
296 1, /* cond_not_taken_branch_cost. */
297 };
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
366 1, /* cond_not_taken_branch_cost. */
367 };
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
422 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
423 (we ensure the alignment). For small blocks an inline loop is still a
424 noticeable win, while for bigger blocks either rep movsl or rep movsb
425 is the way to go. Rep movsb apparently has a more expensive startup time
426 in the CPU, but after 4K the difference is down in the noise. */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
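/* Editor's note on reading the two descriptors above (the layout of
   struct stringop_algs is my assumption; check i386.h): the first
   descriptor drives memcpy expansion, the second memset.  The leading
   algorithm (rep_prefix_4_byte here) is used when the block size is not
   known at compile time; for known sizes the {max, alg, noalign}
   triples are scanned in order, so a memcpy of up to 128 bytes uses an
   inline loop, up to 1024 bytes an unrolled loop, up to 8192 bytes
   rep movsl, and anything larger rep movsb (max == -1 meaning no upper
   bound).  */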
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with REP prefix (relative to loops)
646 compared to K8. Alignment becomes important after 8 bytes for memcpy and
647 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set number of simultaneous prefetches
712 to a large constant to reflect this (it probably is not a good idea not
713 to limit number of prefetches at all, as their execution also takes some
714 time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has optimized REP instruction for medium sized blocks, but for very
724 small blocks it is better to use loop. For large blocks, libcall can
725 do nontemporal accesses and beat inline considerably. */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
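/* Editor's note (concrete reading of the K8 memcpy descriptor above,
   under the same 32-bit/64-bit slot assumption as earlier): in 32-bit
   code a known-size copy of at most 6 bytes uses an inline loop, at
   most 14 bytes an unrolled loop, and anything larger rep movsl, while
   an unknown-size copy goes straight to the library; in 64-bit code
   copies of at most 16 bytes use a loop, at most 8192 bytes rep movsq,
   and larger or unknown sizes a library call.  */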
734 4, /* scalar_stmt_cost. */
735 2, /* scalar load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
744 2, /* cond_not_taken_branch_cost. */
745 };
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
799 /* New AMD processors never drop prefetches; if they cannot be performed
800 immediately, they are queued. We set number of simultaneous prefetches
801 to a large constant to reflect this (it probably is not a good idea not
802 to limit number of prefetches at all, as their execution also takes some
803 time). */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
813 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
814 very small blocks it is better to use loop. For large blocks, libcall can
815 do nontemporal accesses and beat inline considerably. */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
834 1, /* cond_not_taken_branch_cost. */
835 };
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
889 /* New AMD processors never drop prefetches; if they cannot be performed
890 immediately, they are queued. We set number of simultaneous prefetches
891 to a large constant to reflect this (it probably is not a good idea not
892 to limit number of prefetches at all, as their execution also takes some
893 time). */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
903 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
904 very small blocks it is better to use loop. For large blocks, libcall
905 can do nontemporal accesses and beat inline considerably. */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
924 1, /* cond_not_taken_branch_cost. */
925 };
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
979 /* New AMD processors never drop prefetches; if they cannot be performed
980 immediately, they are queued. We set number of simultaneous prefetches
981 to a large constant to reflect this (it probably is not a good idea not
982 to limit number of prefetches at all, as their execution also takes some
983 time). */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
993 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
994 very small blocks it is better to use loop. For large blocks, libcall
995 can do nontemporal accesses and beat inline considerably. */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 /* New AMD processors never drop prefetches; if they cannot be performed
1062 immediately, they are queued. We set number of simultaneous prefetches
1063 to a large constant to reflect this (it probably is not a good idea not
1064 to limit number of prefetches at all, as their execution also takes some
1065 time). */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1075 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1076 very small blocks it is better to use loop. For large blocks, libcall
1077 can do nontemporal accesses and beat inline considerably. */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
1096 1, /* cond_not_taken_branch_cost. */
1097 };
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1160 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1161 very small blocks it is better to use loop. For large blocks, libcall can
1162 do nontemporal accesses and beat inline considerably. */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
1181 1, /* cond_not_taken_branch_cost. */
1182 };
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
1263 1, /* cond_not_taken_branch_cost. */
1264 };
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 128, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471         1,                                    /* scalar_load_cost.  */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487         /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1488            this cost, however, our current implementation of synth_mult results in
1489            the use of unnecessary temporary registers, causing regressions on several
1490            SPECfp benchmarks.  */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534         /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535            value is increased to the perhaps more appropriate value of 5.  */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550         1,                                    /* scalar_load_cost.  */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1562 /* core_cost should produce code tuned for the Core family of CPUs.  */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566         /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1567            this cost, however, our current implementation of synth_mult results in
1568            the use of unnecessary temporary registers, causing regressions on several
1569            SPECfp benchmarks.  */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613         /* FIXME: perhaps a more appropriate value is 5.  */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631         1,                                    /* scalar_load_cost.  */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705         1,                                    /* scalar_load_cost.  */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1734 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1735 #define m_ATOM (1<<PROCESSOR_ATOM)
1737 #define m_GEODE (1<<PROCESSOR_GEODE)
1738 #define m_K6 (1<<PROCESSOR_K6)
1739 #define m_K6_GEODE (m_K6 | m_GEODE)
1740 #define m_K8 (1<<PROCESSOR_K8)
1741 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1742 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1743 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1744 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1745 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1746 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1747 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1748 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1749 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1750 #define m_BTVER (m_BTVER1 | m_BTVER2)
1751 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1753 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1754 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1756 /* Generic instruction choice should be the common subset of supported CPUs
1757    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
1758 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1760 /* Feature tests against the various tunings. */
1761 unsigned char ix86_tune_features[X86_TUNE_LAST];
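/* A rough sketch (not the option-handling code itself, which lives in
   ix86_option_override_internal later in this file) of how this array is
   filled in from initial_ix86_tune_features below: every entry there is a
   bitmask of the m_* processor bits, and a feature is enabled when the bit
   for the CPU selected by -mtune is set.

     unsigned int ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
*/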
1763 /* Feature tests against the various tunings used to create ix86_tune_features
1764 based on the processor mask. */
1765 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1766   /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1767      negatively, so enabling it for Generic64 seems like a good code size
1768      tradeoff.  We can't enable it for 32bit generic because it does not
1769      work well with PPro-based chips.  */
1770 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1772 /* X86_TUNE_PUSH_MEMORY */
1773 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1775 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1776 m_486 | m_PENT,
1778 /* X86_TUNE_UNROLL_STRLEN */
1779 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1781   /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1782      on simulation results.  But after the P4 was released, no performance
1783      benefit was observed with branch hints; they also increase the code size.
1784      As a result, icc never generates branch hints.  */
1787 /* X86_TUNE_DOUBLE_WITH_ADD */
1788 ~m_386,
1790 /* X86_TUNE_USE_SAHF */
1791 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1793 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1794 partial dependencies. */
1795 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1797   /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1798      register stalls on the Generic32 compilation setting as well.  However,
1799      in the current implementation partial register stalls are not eliminated
1800      very well - they can be introduced via subregs synthesized by combine
1801      and can happen in caller/callee saving sequences.  Because this option
1802      pays back little on PPro-based chips and conflicts with the partial reg
1803      dependencies used by Athlon/P4-based chips, it is better to leave it off
1804      for generic32 for now.  */
1805 m_PPRO,
1807 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1808 m_CORE_ALL | m_GENERIC,
1810   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1811      on 16-bit immediate moves into memory on Core2 and Corei7.  */
1812 m_CORE_ALL | m_GENERIC,
1814 /* X86_TUNE_USE_HIMODE_FIOP */
1815 m_386 | m_486 | m_K6_GEODE,
1817 /* X86_TUNE_USE_SIMODE_FIOP */
1818 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1820 /* X86_TUNE_USE_MOV0 */
1821 m_K6,
1823 /* X86_TUNE_USE_CLTD */
1824 ~(m_PENT | m_ATOM | m_K6),
1826 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1827 m_PENT4,
1829 /* X86_TUNE_SPLIT_LONG_MOVES */
1830 m_PPRO,
1832 /* X86_TUNE_READ_MODIFY_WRITE */
1833 ~m_PENT,
1835 /* X86_TUNE_READ_MODIFY */
1836 ~(m_PENT | m_PPRO),
1838 /* X86_TUNE_PROMOTE_QIMODE */
1839 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1841 /* X86_TUNE_FAST_PREFIX */
1842 ~(m_386 | m_486 | m_PENT),
1844 /* X86_TUNE_SINGLE_STRINGOP */
1845 m_386 | m_P4_NOCONA,
1847 /* X86_TUNE_QIMODE_MATH */
1850 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1851 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1852 might be considered for Generic32 if our scheme for avoiding partial
1853 stalls was more effective. */
1854 ~m_PPRO,
1856 /* X86_TUNE_PROMOTE_QI_REGS */
1859 /* X86_TUNE_PROMOTE_HI_REGS */
1860 m_PPRO,
1862 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1863 over esp addition. */
1864 m_386 | m_486 | m_PENT | m_PPRO,
1866 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1867 over esp addition. */
1868 m_PENT,
1870 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1871 over esp subtraction. */
1872 m_386 | m_486 | m_PENT | m_K6_GEODE,
1874   /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
1875 over esp subtraction. */
1876 m_PENT | m_K6_GEODE,
1878 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1879 for DFmode copies */
1880   ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1882 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1883 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1885   /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1886      conflict here between PPro/Pentium4 based chips that treat 128bit
1887      SSE registers as single units versus K8 based chips that divide SSE
1888      registers into two 64bit halves.  This knob promotes all store destinations
1889      to be 128bit to allow register renaming on 128bit SSE units, but usually
1890      results in one extra microop on 64bit SSE units.  Experimental results
1891      show that disabling this option on P4 brings over a 20% SPECfp regression,
1892      while enabling it on K8 brings roughly a 2.4% regression that can be partly
1893      masked by careful scheduling of moves.  */
1894 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1896 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1897 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1899 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1900 m_COREI7 | m_BDVER,
1902 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1903   m_BDVER,
1905   /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1906      are resolved on SSE register parts instead of whole registers, so we may
1907      maintain just the lower part of scalar values in the proper format, leaving
1908      the upper part undefined.  */
1909 m_ATHLON_K8,
1911 /* X86_TUNE_SSE_TYPELESS_STORES */
1912 m_AMD_MULTIPLE,
1914 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1915 m_PPRO | m_P4_NOCONA,
1917 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1918 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1920 /* X86_TUNE_PROLOGUE_USING_MOVE */
1921 m_PPRO | m_ATHLON_K8,
1923 /* X86_TUNE_EPILOGUE_USING_MOVE */
1924 m_PPRO | m_ATHLON_K8,
1926 /* X86_TUNE_SHIFT1 */
1927 ~m_486,
1929 /* X86_TUNE_USE_FFREEP */
1930 m_AMD_MULTIPLE,
1932 /* X86_TUNE_INTER_UNIT_MOVES */
1933 ~(m_AMD_MULTIPLE | m_GENERIC),
1935 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1936   ~(m_AMDFAM10 | m_BDVER),
1938 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1939 than 4 branch instructions in the 16 byte window. */
1940 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1942 /* X86_TUNE_SCHEDULE */
1943 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1945 /* X86_TUNE_USE_BT */
1946 m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1948 /* X86_TUNE_USE_INCDEC */
1949 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
1951 /* X86_TUNE_PAD_RETURNS */
1952 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
1954   /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions.  */
1955 m_ATOM,
1957 /* X86_TUNE_EXT_80387_CONSTANTS */
1958 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1960 /* X86_TUNE_AVOID_VECTOR_DECODE */
1961 m_CORE_ALL | m_K8 | m_GENERIC64,
1963   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1964      and SImode multiplies, but the 386 and 486 do HImode multiplies faster.  */
1965 ~(m_386 | m_486),
1967 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1968 vector path on AMD machines. */
1969 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1971 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1972 machines. */
1973 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1975   /* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
1976      than via a MOV.  */
1977 m_PENT,
1979 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1980 but one byte longer. */
1981 m_PENT,
1983   /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
1984      operand that cannot be represented using a modRM byte.  The XOR
1985      replacement is long decoded, so this split helps here as well.  */
1986 m_K6,
1988 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1989 from FP to FP. */
1990 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
1992 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1993 from integer to FP. */
1994 m_AMDFAM10,
1996 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1997 with a subsequent conditional jump instruction into a single
1998 compare-and-branch uop. */
1999 m_BDVER,
2001 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2002 will impact LEA instruction selection. */
2003 m_ATOM,
2005 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2006 instructions. */
2007 ~m_ATOM,
2009   /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2010      at -O3.  For the moment, the prefetching seems badly tuned for Intel
2011      chips.  */
2012 m_K6_GEODE | m_AMD_MULTIPLE,
2014 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2015 the auto-vectorizer. */
2016 m_BDVER | m_BTVER2,
2018 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2019 during reassociation of integer computation. */
2020 m_ATOM,
2022 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2023 during reassociation of fp computation. */
2024 m_ATOM | m_HASWELL,
2026 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2027 regs instead of memory. */
2028 m_CORE_ALL,
2030 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2031 a conditional move. */
2032 m_ATOM
2035 /* Feature tests against the various architecture variations. */
2036 unsigned char ix86_arch_features[X86_ARCH_LAST];
2038 /* Feature tests against the various architecture variations, used to create
2039 ix86_arch_features based on the processor mask. */
2040 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2041 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2042 ~(m_386 | m_486 | m_PENT | m_K6),
2044 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2045 ~m_386,
2047 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2048 ~(m_386 | m_486),
2050 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2051 ~m_386,
2053 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2054 ~m_386,
2057 static const unsigned int x86_accumulate_outgoing_args
2058 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2060 static const unsigned int x86_arch_always_fancy_math_387
2061 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2063 static const unsigned int x86_avx256_split_unaligned_load
2064 = m_COREI7 | m_GENERIC;
2066 static const unsigned int x86_avx256_split_unaligned_store
2067 = m_COREI7 | m_BDVER | m_GENERIC;
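/* These masks are matched against the -mtune processor bit in the same way
   as the tune features above.  A simplified sketch of the intended use (the
   real checks live in ix86_option_override_internal and also respect
   explicitly given -m options):

     if (x86_accumulate_outgoing_args & ix86_tune_mask)
       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
     if (x86_avx256_split_unaligned_load & ix86_tune_mask)
       target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
*/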
2069 /* If the average insn count for a single function invocation is
2070    lower than this constant, emit fast (but longer) prologue and
2071    epilogue code.  */
2072 #define FAST_PROLOGUE_INSN_COUNT 20
2074 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2075 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2076 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2077 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2079 /* Array of the smallest class containing reg number REGNO, indexed by
2080 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2082 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2084 /* ax, dx, cx, bx */
2085 AREG, DREG, CREG, BREG,
2086 /* si, di, bp, sp */
2087 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2088 /* FP registers */
2089 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2090 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2091 /* arg pointer */
2092 NON_Q_REGS,
2093 /* flags, fpsr, fpcr, frame */
2094 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2095 /* SSE registers */
2096 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2097 SSE_REGS, SSE_REGS,
2098 /* MMX registers */
2099 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2100 MMX_REGS, MMX_REGS,
2101 /* REX registers */
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2104 /* SSE REX registers */
2105 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2106 SSE_REGS, SSE_REGS,
2109 /* The "default" register map used in 32bit mode. */
2111 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2113 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2114 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2115 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2116 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2117 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2119 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2122 /* The "default" register map used in 64bit mode. */
2124 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2126 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2127 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2128 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2129 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2130 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2131 8,9,10,11,12,13,14,15, /* extended integer registers */
2132 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2135 /* Define the register numbers to be used in Dwarf debugging information.
2136 The SVR4 reference port C compiler uses the following register numbers
2137 in its Dwarf output code:
2138 0 for %eax (gcc regno = 0)
2139 1 for %ecx (gcc regno = 2)
2140 2 for %edx (gcc regno = 1)
2141 3 for %ebx (gcc regno = 3)
2142 4 for %esp (gcc regno = 7)
2143 5 for %ebp (gcc regno = 6)
2144 6 for %esi (gcc regno = 4)
2145 7 for %edi (gcc regno = 5)
2146 The following three DWARF register numbers are never generated by
2147 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2148 believes these numbers have these meanings.
2149 8 for %eip (no gcc equivalent)
2150 9 for %eflags (gcc regno = 17)
2151 10 for %trapno (no gcc equivalent)
2152 It is not at all clear how we should number the FP stack registers
2153 for the x86 architecture. If the version of SDB on x86/svr4 were
2154 a bit less brain dead with respect to floating-point then we would
2155 have a precedent to follow with respect to DWARF register numbers
2156 for x86 FP registers, but the SDB on x86/svr4 is so completely
2157 broken with respect to FP registers that it is hardly worth thinking
2158 of it as something to strive for compatibility with.
2159 The version of x86/svr4 SDB I have at the moment does (partially)
2160 seem to believe that DWARF register number 11 is associated with
2161 the x86 register %st(0), but that's about all. Higher DWARF
2162 register numbers don't seem to be associated with anything in
2163 particular, and even for DWARF regno 11, SDB only seems to under-
2164 stand that it should say that a variable lives in %st(0) (when
2165 asked via an `=' command) if we said it was in DWARF regno 11,
2166 but SDB still prints garbage when asked for the value of the
2167 variable in question (via a `/' command).
2168 (Also note that the labels SDB prints for various FP stack regs
2169 when doing an `x' command are all wrong.)
2170 Note that these problems generally don't affect the native SVR4
2171 C compiler because it doesn't allow the use of -O with -g and
2172 because when it is *not* optimizing, it allocates a memory
2173 location for each floating-point variable, and the memory
2174 location is what gets described in the DWARF AT_location
2175 attribute for the variable in question.
2176 Regardless of the severe mental illness of the x86/svr4 SDB, we
2177 do something sensible here and we use the following DWARF
2178 register numbers. Note that these are all stack-top-relative
2179 numbers.
2180 11 for %st(0) (gcc regno = 8)
2181 12 for %st(1) (gcc regno = 9)
2182 13 for %st(2) (gcc regno = 10)
2183 14 for %st(3) (gcc regno = 11)
2184 15 for %st(4) (gcc regno = 12)
2185 16 for %st(5) (gcc regno = 13)
2186 17 for %st(6) (gcc regno = 14)
2187 18 for %st(7) (gcc regno = 15)
2189 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2191 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2192 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2193 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2194 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2195 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2197 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2200 /* Define parameter passing and return registers. */
2202 static int const x86_64_int_parameter_registers[6] =
2204 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2207 static int const x86_64_ms_abi_int_parameter_registers[4] =
2209 CX_REG, DX_REG, R8_REG, R9_REG
2212 static int const x86_64_int_return_registers[4] =
2214 AX_REG, DX_REG, DI_REG, SI_REG
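/* For reference: x86_64_int_parameter_registers is the SysV AMD64 ABI
   argument order (%rdi, %rsi, %rdx, %rcx, %r8, %r9), while the Microsoft
   x64 ABI passes the first four integer arguments in %rcx, %rdx, %r8 and
   %r9.  Integer return values use %rax first and %rdx second.  */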
2217 /* Define the structure for the machine field in struct function. */
2219 struct GTY(()) stack_local_entry {
2220 unsigned short mode;
2221 unsigned short n;
2222 rtx rtl;
2223 struct stack_local_entry *next;
2226 /* Structure describing stack frame layout.
2227 Stack grows downward:
2229 [arguments]
2230 <- ARG_POINTER
2231 saved pc
2233 saved static chain if ix86_static_chain_on_stack
2235 saved frame pointer if frame_pointer_needed
2236 <- HARD_FRAME_POINTER
2237 [saved regs]
2238 <- regs_save_offset
2239 [padding0]
2241 [saved SSE regs]
2242 <- sse_regs_save_offset
2243 [padding1] |
2244 | <- FRAME_POINTER
2245 [va_arg registers] |
2247 [frame] |
2249 [padding2] | = to_allocate
2250 <- STACK_POINTER
2252 struct ix86_frame
2254 int nsseregs;
2255 int nregs;
2256 int va_arg_size;
2257 int red_zone_size;
2258 int outgoing_arguments_size;
2260 /* The offsets relative to ARG_POINTER. */
2261 HOST_WIDE_INT frame_pointer_offset;
2262 HOST_WIDE_INT hard_frame_pointer_offset;
2263 HOST_WIDE_INT stack_pointer_offset;
2264 HOST_WIDE_INT hfp_save_offset;
2265 HOST_WIDE_INT reg_save_offset;
2266 HOST_WIDE_INT sse_reg_save_offset;
2268 /* When save_regs_using_mov is set, emit prologue using
2269 move instead of push instructions. */
2270 bool save_regs_using_mov;
2273 /* Which CPU we are scheduling for.  */
2274 enum attr_cpu ix86_schedule;
2276 /* Which CPU we are optimizing for.  */
2277 enum processor_type ix86_tune;
2279 /* Which instruction set architecture to use. */
2280 enum processor_type ix86_arch;
2282 /* True if processor has SSE prefetch instruction. */
2283 unsigned char x86_prefetch_sse;
2285 /* -mstackrealign option */
2286 static const char ix86_force_align_arg_pointer_string[]
2287 = "force_align_arg_pointer";
2289 static rtx (*ix86_gen_leave) (void);
2290 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2292 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2293 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2294 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2296 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2297 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2300 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2302 /* Preferred alignment for stack boundary in bits. */
2303 unsigned int ix86_preferred_stack_boundary;
2305 /* Alignment for incoming stack boundary in bits specified at
2306 command line. */
2307 static unsigned int ix86_user_incoming_stack_boundary;
2309 /* Default alignment for incoming stack boundary in bits. */
2310 static unsigned int ix86_default_incoming_stack_boundary;
2312 /* Alignment for incoming stack boundary in bits. */
2313 unsigned int ix86_incoming_stack_boundary;
2315 /* Calling-ABI-specific va_list type nodes.  */
2316 static GTY(()) tree sysv_va_list_type_node;
2317 static GTY(()) tree ms_va_list_type_node;
2319 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2320 char internal_label_prefix[16];
2321 int internal_label_prefix_len;
2323 /* Fence to use after loop using movnt. */
2324 tree x86_mfence;
2326 /* Register classes used for passing a given 64bit part of the argument.
2327    These represent the classes documented by the psABI, with the exception
2328    of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2329    uses an SF or DFmode move instead of a DImode move to avoid reformatting penalties.
2331    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2332    whenever possible (the upper half contains padding).  */
2333 enum x86_64_reg_class
2335 X86_64_NO_CLASS,
2336 X86_64_INTEGER_CLASS,
2337 X86_64_INTEGERSI_CLASS,
2338 X86_64_SSE_CLASS,
2339 X86_64_SSESF_CLASS,
2340 X86_64_SSEDF_CLASS,
2341 X86_64_SSEUP_CLASS,
2342 X86_64_X87_CLASS,
2343 X86_64_X87UP_CLASS,
2344 X86_64_COMPLEX_X87_CLASS,
2345 X86_64_MEMORY_CLASS
2348 #define MAX_CLASSES 4
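/* As an example of the classification performed by classify_argument further
   down in this file:  struct { double d; int i; }  has its first eightbyte
   classified as X86_64_SSEDF_CLASS and its second as X86_64_INTEGERSI_CLASS,
   so it is passed in one SSE register and one general-purpose register.  */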
2350 /* Table of constants used by fldpi, fldln2, etc.... */
2351 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2352 static bool ext_80387_constants_init = 0;
2355 static struct machine_function * ix86_init_machine_status (void);
2356 static rtx ix86_function_value (const_tree, const_tree, bool);
2357 static bool ix86_function_value_regno_p (const unsigned int);
2358 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2359 const_tree);
2360 static rtx ix86_static_chain (const_tree, bool);
2361 static int ix86_function_regparm (const_tree, const_tree);
2362 static void ix86_compute_frame_layout (struct ix86_frame *);
2363 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2364 rtx, rtx, int);
2365 static void ix86_add_new_builtins (HOST_WIDE_INT);
2366 static tree ix86_canonical_va_list_type (tree);
2367 static void predict_jump (int);
2368 static unsigned int split_stack_prologue_scratch_regno (void);
2369 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2371 enum ix86_function_specific_strings
2373 IX86_FUNCTION_SPECIFIC_ARCH,
2374 IX86_FUNCTION_SPECIFIC_TUNE,
2375 IX86_FUNCTION_SPECIFIC_MAX
2378 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2379 const char *, enum fpmath_unit, bool);
2380 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2381 static void ix86_function_specific_save (struct cl_target_option *);
2382 static void ix86_function_specific_restore (struct cl_target_option *);
2383 static void ix86_function_specific_print (FILE *, int,
2384 struct cl_target_option *);
2385 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2386 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2387 struct gcc_options *);
2388 static bool ix86_can_inline_p (tree, tree);
2389 static void ix86_set_current_function (tree);
2390 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2392 static enum calling_abi ix86_function_abi (const_tree);
2395 #ifndef SUBTARGET32_DEFAULT_CPU
2396 #define SUBTARGET32_DEFAULT_CPU "i386"
2397 #endif
2399 /* Whether -mtune= or -march= were specified */
2400 static int ix86_tune_defaulted;
2401 static int ix86_arch_specified;
2403 /* Vectorization library interface and handlers. */
2404 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2407 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2409 /* Processor target table, indexed by processor number */
2410 struct ptt
2412 const struct processor_costs *cost; /* Processor costs */
2413 const int align_loop; /* Default alignments. */
2414 const int align_loop_max_skip;
2415 const int align_jump;
2416 const int align_jump_max_skip;
2417 const int align_func;
2420 static const struct ptt processor_target_table[PROCESSOR_max] =
2422 {&i386_cost, 4, 3, 4, 3, 4},
2423 {&i486_cost, 16, 15, 16, 15, 16},
2424 {&pentium_cost, 16, 7, 16, 7, 16},
2425 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2426 {&geode_cost, 0, 0, 0, 0, 0},
2427 {&k6_cost, 32, 7, 32, 7, 32},
2428 {&athlon_cost, 16, 7, 16, 7, 16},
2429 {&pentium4_cost, 0, 0, 0, 0, 0},
2430 {&k8_cost, 16, 7, 16, 7, 16},
2431 {&nocona_cost, 0, 0, 0, 0, 0},
2432 /* Core 2 */
2433 {&core_cost, 16, 10, 16, 10, 16},
2434 /* Core i7 */
2435 {&core_cost, 16, 10, 16, 10, 16},
2436 /* Core avx2 */
2437 {&core_cost, 16, 10, 16, 10, 16},
2438 {&generic32_cost, 16, 7, 16, 7, 16},
2439 {&generic64_cost, 16, 10, 16, 10, 16},
2440 {&amdfam10_cost, 32, 24, 32, 7, 32},
2441 {&bdver1_cost, 32, 24, 32, 7, 32},
2442 {&bdver2_cost, 32, 24, 32, 7, 32},
2443 {&bdver3_cost, 32, 24, 32, 7, 32},
2444 {&btver1_cost, 32, 24, 32, 7, 32},
2445 {&btver2_cost, 32, 24, 32, 7, 32},
2446 {&atom_cost, 16, 15, 16, 7, 16}
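/* A rough sketch of how this table is consumed during option processing
   (the real code is in ix86_option_override_internal and only applies these
   defaults when the corresponding -falign-* option was not given):

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     if (align_loops == 0)
       align_loops = processor_target_table[ix86_tune].align_loop;
     if (align_jumps == 0)
       align_jumps = processor_target_table[ix86_tune].align_jump;
     if (align_functions == 0)
       align_functions = processor_target_table[ix86_tune].align_func;
*/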
2449 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2451 "generic",
2452 "i386",
2453 "i486",
2454 "pentium",
2455 "pentium-mmx",
2456 "pentiumpro",
2457 "pentium2",
2458 "pentium3",
2459 "pentium4",
2460 "pentium-m",
2461 "prescott",
2462 "nocona",
2463 "core2",
2464 "corei7",
2465 "core-avx2",
2466 "atom",
2467 "geode",
2468 "k6",
2469 "k6-2",
2470 "k6-3",
2471 "athlon",
2472 "athlon-4",
2473 "k8",
2474 "amdfam10",
2475 "bdver1",
2476 "bdver2",
2477 "bdver3",
2478 "btver1",
2479 "btver2"
2482 static bool
2483 gate_insert_vzeroupper (void)
2485 return TARGET_VZEROUPPER;
2488 static unsigned int
2489 rest_of_handle_insert_vzeroupper (void)
2491 int i;
2493 /* vzeroupper instructions are inserted immediately after reload to
2494    account for possible spills from 256bit registers.  The pass
2495    reuses the mode switching infrastructure by re-running the mode
2496    insertion pass, so disable entities that have already been processed.  */
2497 for (i = 0; i < MAX_386_ENTITIES; i++)
2498 ix86_optimize_mode_switching[i] = 0;
2500 ix86_optimize_mode_switching[AVX_U128] = 1;
2502 /* Call optimize_mode_switching. */
2503 pass_mode_switching.pass.execute ();
2504 return 0;
2507 struct rtl_opt_pass pass_insert_vzeroupper =
2510 RTL_PASS,
2511 "vzeroupper", /* name */
2512 OPTGROUP_NONE, /* optinfo_flags */
2513 gate_insert_vzeroupper, /* gate */
2514 rest_of_handle_insert_vzeroupper, /* execute */
2515 NULL, /* sub */
2516 NULL, /* next */
2517 0, /* static_pass_number */
2518 TV_NONE, /* tv_id */
2519 0, /* properties_required */
2520 0, /* properties_provided */
2521 0, /* properties_destroyed */
2522 0, /* todo_flags_start */
2523 TODO_df_finish | TODO_verify_rtl_sharing |
2524 0, /* todo_flags_finish */
2528 /* Return true if a red-zone is in use. */
2530 static inline bool
2531 ix86_using_red_zone (void)
2533 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2536 /* Return a string that documents the current -m options. The caller is
2537 responsible for freeing the string. */
2539 static char *
2540 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2541 const char *tune, enum fpmath_unit fpmath,
2542 bool add_nl_p)
2544 struct ix86_target_opts
2546 const char *option; /* option string */
2547 HOST_WIDE_INT mask; /* isa mask options */
2550   /* This table is ordered so that options like -msse4.2 that imply
2551      preceding options match those first.  */
2552 static struct ix86_target_opts isa_opts[] =
2554 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2555 { "-mfma", OPTION_MASK_ISA_FMA },
2556 { "-mxop", OPTION_MASK_ISA_XOP },
2557 { "-mlwp", OPTION_MASK_ISA_LWP },
2558 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2559 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2560 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2561 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2562 { "-msse3", OPTION_MASK_ISA_SSE3 },
2563 { "-msse2", OPTION_MASK_ISA_SSE2 },
2564 { "-msse", OPTION_MASK_ISA_SSE },
2565 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2566 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2567 { "-mmmx", OPTION_MASK_ISA_MMX },
2568 { "-mabm", OPTION_MASK_ISA_ABM },
2569 { "-mbmi", OPTION_MASK_ISA_BMI },
2570 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2571 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2572 { "-mhle", OPTION_MASK_ISA_HLE },
2573 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2574 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2575 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2576 { "-madx", OPTION_MASK_ISA_ADX },
2577 { "-mtbm", OPTION_MASK_ISA_TBM },
2578 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2579 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2580 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2581 { "-maes", OPTION_MASK_ISA_AES },
2582 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2583 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2584 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2585 { "-mf16c", OPTION_MASK_ISA_F16C },
2586 { "-mrtm", OPTION_MASK_ISA_RTM },
2587 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2588 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2591 /* Flag options. */
2592 static struct ix86_target_opts flag_opts[] =
2594 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2595 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2596 { "-m80387", MASK_80387 },
2597 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2598 { "-malign-double", MASK_ALIGN_DOUBLE },
2599 { "-mcld", MASK_CLD },
2600 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2601 { "-mieee-fp", MASK_IEEE_FP },
2602 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2603 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2604 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2605 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2606 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2607 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2608 { "-mno-red-zone", MASK_NO_RED_ZONE },
2609 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2610 { "-mrecip", MASK_RECIP },
2611 { "-mrtd", MASK_RTD },
2612 { "-msseregparm", MASK_SSEREGPARM },
2613 { "-mstack-arg-probe", MASK_STACK_PROBE },
2614 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2615 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2616 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2617 { "-mvzeroupper", MASK_VZEROUPPER },
2618 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2619 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2620 { "-mprefer-avx128", MASK_PREFER_AVX128},
2623 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2625 char isa_other[40];
2626 char target_other[40];
2627 unsigned num = 0;
2628 unsigned i, j;
2629 char *ret;
2630 char *ptr;
2631 size_t len;
2632 size_t line_len;
2633 size_t sep_len;
2634 const char *abi;
2636 memset (opts, '\0', sizeof (opts));
2638 /* Add -march= option. */
2639 if (arch)
2641 opts[num][0] = "-march=";
2642 opts[num++][1] = arch;
2645 /* Add -mtune= option. */
2646 if (tune)
2648 opts[num][0] = "-mtune=";
2649 opts[num++][1] = tune;
2652 /* Add -m32/-m64/-mx32. */
2653 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2655 if ((isa & OPTION_MASK_ABI_64) != 0)
2656 abi = "-m64";
2657 else
2658 abi = "-mx32";
2659 isa &= ~ (OPTION_MASK_ISA_64BIT
2660 | OPTION_MASK_ABI_64
2661 | OPTION_MASK_ABI_X32);
2663 else
2664 abi = "-m32";
2665 opts[num++][0] = abi;
2667 /* Pick out the options in isa options. */
2668 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2670 if ((isa & isa_opts[i].mask) != 0)
2672 opts[num++][0] = isa_opts[i].option;
2673 isa &= ~ isa_opts[i].mask;
2677 if (isa && add_nl_p)
2679 opts[num++][0] = isa_other;
2680 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2681 isa);
2684 /* Add flag options. */
2685 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2687 if ((flags & flag_opts[i].mask) != 0)
2689 opts[num++][0] = flag_opts[i].option;
2690 flags &= ~ flag_opts[i].mask;
2694 if (flags && add_nl_p)
2696 opts[num++][0] = target_other;
2697 sprintf (target_other, "(other flags: %#x)", flags);
2700 /* Add -fpmath= option. */
2701 if (fpmath)
2703 opts[num][0] = "-mfpmath=";
2704 switch ((int) fpmath)
2706 case FPMATH_387:
2707 opts[num++][1] = "387";
2708 break;
2710 case FPMATH_SSE:
2711 opts[num++][1] = "sse";
2712 break;
2714 case FPMATH_387 | FPMATH_SSE:
2715 opts[num++][1] = "sse+387";
2716 break;
2718 default:
2719 gcc_unreachable ();
2723 /* Any options? */
2724 if (num == 0)
2725 return NULL;
2727 gcc_assert (num < ARRAY_SIZE (opts));
2729 /* Size the string. */
2730 len = 0;
2731 sep_len = (add_nl_p) ? 3 : 1;
2732 for (i = 0; i < num; i++)
2734 len += sep_len;
2735 for (j = 0; j < 2; j++)
2736 if (opts[i][j])
2737 len += strlen (opts[i][j]);
2740 /* Build the string. */
2741 ret = ptr = (char *) xmalloc (len);
2742 line_len = 0;
2744 for (i = 0; i < num; i++)
2746 size_t len2[2];
2748 for (j = 0; j < 2; j++)
2749 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2751 if (i != 0)
2753 *ptr++ = ' ';
2754 line_len++;
2756 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2758 *ptr++ = '\\';
2759 *ptr++ = '\n';
2760 line_len = 0;
2764 for (j = 0; j < 2; j++)
2765 if (opts[i][j])
2767 memcpy (ptr, opts[i][j], len2[j]);
2768 ptr += len2[j];
2769 line_len += len2[j];
2773 *ptr = '\0';
2774 gcc_assert (ret + len >= ptr);
2776 return ret;
2779 /* Return true if profiling code should be emitted before the
2780    prologue; otherwise return false.
2781    Note: for x86 this is only the case when -mfentry is in effect.  */
2782 static bool
2783 ix86_profile_before_prologue (void)
2785 return flag_fentry != 0;
2788 /* Function that is callable from the debugger to print the current
2789 options. */
2790 void
2791 ix86_debug_options (void)
2793 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2794 ix86_arch_string, ix86_tune_string,
2795 ix86_fpmath, true);
2797 if (opts)
2799 fprintf (stderr, "%s\n\n", opts);
2800 free (opts);
2802 else
2803 fputs ("<no options>\n\n", stderr);
2805 return;
2808 /* Override various settings based on options. If MAIN_ARGS_P, the
2809 options are from the command line, otherwise they are from
2810 attributes. */
2812 static void
2813 ix86_option_override_internal (bool main_args_p)
2815 int i;
2816 unsigned int ix86_arch_mask, ix86_tune_mask;
2817 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2818 const char *prefix;
2819 const char *suffix;
2820 const char *sw;
2822 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2823 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2824 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2825 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2826 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2827 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2828 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2829 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2830 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2831 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2832 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2833 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2834 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2835 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2836 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2837 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2838 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2839 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2840 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2841 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2842 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2843 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2844 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2845 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2846 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2847 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2848 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2849 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2850 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2851 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2852 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2853 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2854 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2855 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2856 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2857 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2858 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2859 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2860 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2861 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2863 /* If this reaches 64, we need to widen the flags field in struct pta below.  */
2865 static struct pta
2867 const char *const name; /* processor name or nickname. */
2868 const enum processor_type processor;
2869 const enum attr_cpu schedule;
2870 const unsigned HOST_WIDE_INT flags;
2872 const processor_alias_table[] =
2874 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2875 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2876 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2877 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2878 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2879 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2880 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2881 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2882 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2883 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2884 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2885 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2886 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2887 PTA_MMX | PTA_SSE | PTA_FXSR},
2888 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2889 PTA_MMX | PTA_SSE | PTA_FXSR},
2890 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2891 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2892 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2893 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2894 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2895 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2896 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2897 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2898 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2899 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2900 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2901 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2902 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2903 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2904 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2905 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2906 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2907 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2908 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2909 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2910 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2911 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2912 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2913 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2914 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2915 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2916 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2917 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
2918 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2919 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2920 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2921 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2922 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2923 | PTA_XSAVEOPT},
2924 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2925 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2926 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2927 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2928 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2929 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2930 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2931 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2932 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2933 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2934 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2935 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2936 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2937 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2938 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2939 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2940 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2941 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2942 {"x86-64", PROCESSOR_K8, CPU_K8,
2943 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2944 {"k8", PROCESSOR_K8, CPU_K8,
2945 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2946 | PTA_SSE2 | PTA_NO_SAHF},
2947 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2948 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2949 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2950 {"opteron", PROCESSOR_K8, CPU_K8,
2951 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2952 | PTA_SSE2 | PTA_NO_SAHF},
2953 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2954 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2955 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2956 {"athlon64", PROCESSOR_K8, CPU_K8,
2957 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2958 | PTA_SSE2 | PTA_NO_SAHF},
2959 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2960 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2961 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2962 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2963 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2964 | PTA_SSE2 | PTA_NO_SAHF},
2965 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2966 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2967 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2968 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2969 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2970 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2971 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2974 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2975 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2976 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2977 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2978 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2979 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2980 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2981 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2982 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2983 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2984 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2985 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2986 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2987 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2988 | PTA_XSAVEOPT},
2989 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2990 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2991 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2992 | PTA_FXSR | PTA_XSAVE},
2993 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
2994 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2995 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2996 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2997 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2998 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3000 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3001 PTA_HLE /* flags are only used for -march switch. */ },
3002 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3003 PTA_64BIT
3004 | PTA_HLE /* flags are only used for -march switch. */ },
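/* Each entry above pairs a -march=/-mtune= name with a scheduling model
   and a PTA_* feature bitmask; the loops over processor_alias_table
   further down translate that bitmask into OPTION_MASK_ISA_* bits unless
   the user set those explicitly.  A minimal, self-contained sketch of the
   same name-to-bitmask lookup follows, kept out of the build; the types,
   names and flag values in it are illustrative only, not GCC's.  */
#if 0
#include <stdio.h>
#include <string.h>

struct example_entry { const char *name; unsigned flags; };

static const struct example_entry example_table[] = {
  { "k8",     0x1 | 0x2 },         /* stand-ins for PTA_64BIT | PTA_MMX */
  { "btver2", 0x1 | 0x2 | 0x4 },   /* stand-ins for ... | PTA_SSE       */
};

static unsigned
example_lookup (const char *arch)
{
  size_t i;
  for (i = 0; i < sizeof example_table / sizeof example_table[0]; i++)
    if (!strcmp (arch, example_table[i].name))
      return example_table[i].flags;
  return 0;   /* unknown -march= value; the real code reports an error */
}

int
main (void)
{
  printf ("flags for btver2: %#x\n", example_lookup ("btver2"));   /* 0x7 */
  return 0;
}
#endif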
3007 /* -mrecip options. */
3008 static struct
3010 const char *string; /* option name */
3011 unsigned int mask; /* mask bits to set */
3013 const recip_options[] =
3015 { "all", RECIP_MASK_ALL },
3016 { "none", RECIP_MASK_NONE },
3017 { "div", RECIP_MASK_DIV },
3018 { "sqrt", RECIP_MASK_SQRT },
3019 { "vec-div", RECIP_MASK_VEC_DIV },
3020 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3023 int const pta_size = ARRAY_SIZE (processor_alias_table);
3025 /* Set up prefix/suffix so the error messages refer to either the command
3026 line argument, or the attribute(target). */
3027 if (main_args_p)
3029 prefix = "-m";
3030 suffix = "";
3031 sw = "switch";
3033 else
3035 prefix = "option(\"";
3036 suffix = "\")";
3037 sw = "attribute";
3040 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3041 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3042 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3043 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3044 #ifdef TARGET_BI_ARCH
3045 else
3047 #if TARGET_BI_ARCH == 1
3048 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3049 is on and OPTION_MASK_ABI_X32 is off. We turn off
3050 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3051 -mx32. */
3052 if (TARGET_X32)
3053 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3054 #else
3055 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3056 on and OPTION_MASK_ABI_64 is off. We turn off
3057 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3058 -m64. */
3059 if (TARGET_LP64)
3060 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3061 #endif
3063 #endif
3065 if (TARGET_X32)
3067 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3068 OPTION_MASK_ABI_64 for TARGET_X32. */
3069 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3070 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3072 else if (TARGET_LP64)
3074 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3075 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3076 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3077 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3080 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3081 SUBTARGET_OVERRIDE_OPTIONS;
3082 #endif
3084 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3085 SUBSUBTARGET_OVERRIDE_OPTIONS;
3086 #endif
3088 /* -fPIC is the default for x86_64. */
3089 if (TARGET_MACHO && TARGET_64BIT)
3090 flag_pic = 2;
3092 /* Need to check -mtune=generic first. */
3093 if (ix86_tune_string)
3095 if (!strcmp (ix86_tune_string, "generic")
3096 || !strcmp (ix86_tune_string, "i686")
3097 /* As special support for cross compilers we read -mtune=native
3098 as -mtune=generic. With native compilers we won't see the
3099 -mtune=native, as it was changed by the driver. */
3100 || !strcmp (ix86_tune_string, "native"))
3102 if (TARGET_64BIT)
3103 ix86_tune_string = "generic64";
3104 else
3105 ix86_tune_string = "generic32";
3107 /* If this call is for setting the option attribute, allow the
3108 generic32/generic64 that was previously set. */
3109 else if (!main_args_p
3110 && (!strcmp (ix86_tune_string, "generic32")
3111 || !strcmp (ix86_tune_string, "generic64")))
3113 else if (!strncmp (ix86_tune_string, "generic", 7))
3114 error ("bad value (%s) for %stune=%s %s",
3115 ix86_tune_string, prefix, suffix, sw);
3116 else if (!strcmp (ix86_tune_string, "x86-64"))
3117 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3118 "%stune=k8%s or %stune=generic%s instead as appropriate",
3119 prefix, suffix, prefix, suffix, prefix, suffix);
3121 else
3123 if (ix86_arch_string)
3124 ix86_tune_string = ix86_arch_string;
3125 if (!ix86_tune_string)
3127 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3128 ix86_tune_defaulted = 1;
3131 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3132 need to use a sensible tune option. */
3133 if (!strcmp (ix86_tune_string, "generic")
3134 || !strcmp (ix86_tune_string, "x86-64")
3135 || !strcmp (ix86_tune_string, "i686"))
3137 if (TARGET_64BIT)
3138 ix86_tune_string = "generic64";
3139 else
3140 ix86_tune_string = "generic32";
3144 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3146 /* rep; movq isn't available in 32-bit code. */
3147 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3148 ix86_stringop_alg = no_stringop;
3151 if (!ix86_arch_string)
3152 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3153 else
3154 ix86_arch_specified = 1;
3156 if (global_options_set.x_ix86_pmode)
3158 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3159 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3160 error ("address mode %qs not supported in the %s bit mode",
3161 TARGET_64BIT ? "short" : "long",
3162 TARGET_64BIT ? "64" : "32");
3164 else
3165 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3167 if (!global_options_set.x_ix86_abi)
3168 ix86_abi = DEFAULT_ABI;
3170 if (global_options_set.x_ix86_cmodel)
3172 switch (ix86_cmodel)
3174 case CM_SMALL:
3175 case CM_SMALL_PIC:
3176 if (flag_pic)
3177 ix86_cmodel = CM_SMALL_PIC;
3178 if (!TARGET_64BIT)
3179 error ("code model %qs not supported in the %s bit mode",
3180 "small", "32");
3181 break;
3183 case CM_MEDIUM:
3184 case CM_MEDIUM_PIC:
3185 if (flag_pic)
3186 ix86_cmodel = CM_MEDIUM_PIC;
3187 if (!TARGET_64BIT)
3188 error ("code model %qs not supported in the %s bit mode",
3189 "medium", "32");
3190 else if (TARGET_X32)
3191 error ("code model %qs not supported in x32 mode",
3192 "medium");
3193 break;
3195 case CM_LARGE:
3196 case CM_LARGE_PIC:
3197 if (flag_pic)
3198 ix86_cmodel = CM_LARGE_PIC;
3199 if (!TARGET_64BIT)
3200 error ("code model %qs not supported in the %s bit mode",
3201 "large", "32");
3202 else if (TARGET_X32)
3203 error ("code model %qs not supported in x32 mode",
3204 "large");
3205 break;
3207 case CM_32:
3208 if (flag_pic)
3209 error ("code model %s does not support PIC mode", "32");
3210 if (TARGET_64BIT)
3211 error ("code model %qs not supported in the %s bit mode",
3212 "32", "64");
3213 break;
3215 case CM_KERNEL:
3216 if (flag_pic)
3218 error ("code model %s does not support PIC mode", "kernel");
3219 ix86_cmodel = CM_32;
3221 if (!TARGET_64BIT)
3222 error ("code model %qs not supported in the %s bit mode",
3223 "kernel", "32");
3224 break;
3226 default:
3227 gcc_unreachable ();
3230 else
3232 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3233 use of rip-relative addressing. This eliminates fixups that
3234 would otherwise be needed if this object is to be placed in a
3235 DLL, and is essentially just as efficient as direct addressing. */
3236 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3237 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3238 else if (TARGET_64BIT && TARGET_RDOS)
3239 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3240 else if (TARGET_64BIT)
3241 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3242 else
3243 ix86_cmodel = CM_32;
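/* A minimal sketch of the fallback above, ignoring the MS_ABI and RDOS
   special cases; the enum and function names are illustrative only.
   64-bit defaults to the small code model (its PIC variant under -fpic),
   and 32-bit always uses CM_32.  */
#if 0
#include <stdio.h>

enum example_cmodel { EX_CM_32, EX_CM_SMALL, EX_CM_SMALL_PIC };

static enum example_cmodel
example_default_cmodel (int target_64bit, int flag_pic)
{
  if (target_64bit)
    return flag_pic ? EX_CM_SMALL_PIC : EX_CM_SMALL;
  return EX_CM_32;
}

int
main (void)
{
  /* 64-bit with -fpic picks the small PIC model.  */
  printf ("%d\n", example_default_cmodel (1, 1) == EX_CM_SMALL_PIC);   /* 1 */
  return 0;
}
#endif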
3245 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3247 error ("-masm=intel not supported in this configuration");
3248 ix86_asm_dialect = ASM_ATT;
3250 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3251 sorry ("%i-bit mode not compiled in",
3252 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3254 for (i = 0; i < pta_size; i++)
3255 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3257 ix86_schedule = processor_alias_table[i].schedule;
3258 ix86_arch = processor_alias_table[i].processor;
3259 /* Default cpu tuning to the architecture. */
3260 ix86_tune = ix86_arch;
3262 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3263 error ("CPU you selected does not support x86-64 "
3264 "instruction set");
3266 if (processor_alias_table[i].flags & PTA_MMX
3267 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3268 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3269 if (processor_alias_table[i].flags & PTA_3DNOW
3270 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3271 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3272 if (processor_alias_table[i].flags & PTA_3DNOW_A
3273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3274 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3275 if (processor_alias_table[i].flags & PTA_SSE
3276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3277 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3278 if (processor_alias_table[i].flags & PTA_SSE2
3279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3280 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3281 if (processor_alias_table[i].flags & PTA_SSE3
3282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3283 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3284 if (processor_alias_table[i].flags & PTA_SSSE3
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3286 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3287 if (processor_alias_table[i].flags & PTA_SSE4_1
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3289 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3290 if (processor_alias_table[i].flags & PTA_SSE4_2
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3292 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3293 if (processor_alias_table[i].flags & PTA_AVX
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3295 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3296 if (processor_alias_table[i].flags & PTA_AVX2
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3298 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3299 if (processor_alias_table[i].flags & PTA_FMA
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3301 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3302 if (processor_alias_table[i].flags & PTA_SSE4A
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3304 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3305 if (processor_alias_table[i].flags & PTA_FMA4
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3307 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3308 if (processor_alias_table[i].flags & PTA_XOP
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3310 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3311 if (processor_alias_table[i].flags & PTA_LWP
3312 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3313 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3314 if (processor_alias_table[i].flags & PTA_ABM
3315 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3316 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3317 if (processor_alias_table[i].flags & PTA_BMI
3318 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3319 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3320 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3321 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3322 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3323 if (processor_alias_table[i].flags & PTA_TBM
3324 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3325 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3326 if (processor_alias_table[i].flags & PTA_BMI2
3327 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3328 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3329 if (processor_alias_table[i].flags & PTA_CX16
3330 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3331 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3332 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3333 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3334 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3335 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3336 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3337 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3338 if (processor_alias_table[i].flags & PTA_MOVBE
3339 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3340 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3341 if (processor_alias_table[i].flags & PTA_AES
3342 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3343 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3344 if (processor_alias_table[i].flags & PTA_PCLMUL
3345 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3346 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3347 if (processor_alias_table[i].flags & PTA_FSGSBASE
3348 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3349 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3350 if (processor_alias_table[i].flags & PTA_RDRND
3351 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3352 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3353 if (processor_alias_table[i].flags & PTA_F16C
3354 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3355 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3356 if (processor_alias_table[i].flags & PTA_RTM
3357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3358 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3359 if (processor_alias_table[i].flags & PTA_HLE
3360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3361 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3362 if (processor_alias_table[i].flags & PTA_PRFCHW
3363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3364 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3365 if (processor_alias_table[i].flags & PTA_RDSEED
3366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3367 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3368 if (processor_alias_table[i].flags & PTA_ADX
3369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3370 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3371 if (processor_alias_table[i].flags & PTA_FXSR
3372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3373 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3374 if (processor_alias_table[i].flags & PTA_XSAVE
3375 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3376 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3377 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3378 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3379 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3380 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3381 x86_prefetch_sse = true;
3383 break;
3386 if (!strcmp (ix86_arch_string, "generic"))
3387 error ("generic CPU can be used only for %stune=%s %s",
3388 prefix, suffix, sw);
3389 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3390 error ("bad value (%s) for %sarch=%s %s",
3391 ix86_arch_string, prefix, suffix, sw);
3393 ix86_arch_mask = 1u << ix86_arch;
3394 for (i = 0; i < X86_ARCH_LAST; ++i)
3395 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
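/* The initial_ix86_arch_features[] table stores one machine word per
   feature, with bit N meaning "on for processor N"; the loop above picks
   the bit for the selected architecture and turns it into a boolean.  A
   standalone sketch of that pattern, with made-up table values:  */
#if 0
#include <stdio.h>

#define EXAMPLE_NUM_FEATURES 3

static const unsigned example_initial_features[EXAMPLE_NUM_FEATURES] = {
  0x5,   /* feature 0: on for processors 0 and 2 */
  0x2,   /* feature 1: on for processor 1 only   */
  0x7,   /* feature 2: on for processors 0..2    */
};

int
main (void)
{
  unsigned arch = 2;                  /* index of the selected processor */
  unsigned arch_mask = 1u << arch;
  unsigned char features[EXAMPLE_NUM_FEATURES];
  int i;

  for (i = 0; i < EXAMPLE_NUM_FEATURES; i++)
    features[i] = !!(example_initial_features[i] & arch_mask);

  for (i = 0; i < EXAMPLE_NUM_FEATURES; i++)
    printf ("feature %d: %d\n", i, features[i]);   /* 1, 0, 1 */
  return 0;
}
#endif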
3397 for (i = 0; i < pta_size; i++)
3398 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3400 ix86_schedule = processor_alias_table[i].schedule;
3401 ix86_tune = processor_alias_table[i].processor;
3402 if (TARGET_64BIT)
3404 if (!(processor_alias_table[i].flags & PTA_64BIT))
3406 if (ix86_tune_defaulted)
3408 ix86_tune_string = "x86-64";
3409 for (i = 0; i < pta_size; i++)
3410 if (! strcmp (ix86_tune_string,
3411 processor_alias_table[i].name))
3412 break;
3413 ix86_schedule = processor_alias_table[i].schedule;
3414 ix86_tune = processor_alias_table[i].processor;
3416 else
3417 error ("CPU you selected does not support x86-64 "
3418 "instruction set");
3421 else
3423 /* Adjust tuning when compiling for 32-bit ABI. */
3424 switch (ix86_tune)
3426 case PROCESSOR_GENERIC64:
3427 ix86_tune = PROCESSOR_GENERIC32;
3428 ix86_schedule = CPU_PENTIUMPRO;
3429 break;
3431 default:
3432 break;
3435 /* Intel CPUs have always interpreted SSE prefetch instructions as
3436 NOPs; so, we can enable SSE prefetch instructions even when
3437 -mtune (rather than -march) points us to a processor that has them.
3438 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3439 higher processors. */
3440 if (TARGET_CMOV
3441 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3442 x86_prefetch_sse = true;
3443 break;
3446 if (ix86_tune_specified && i == pta_size)
3447 error ("bad value (%s) for %stune=%s %s",
3448 ix86_tune_string, prefix, suffix, sw);
3450 ix86_tune_mask = 1u << ix86_tune;
3451 for (i = 0; i < X86_TUNE_LAST; ++i)
3452 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3454 #ifndef USE_IX86_FRAME_POINTER
3455 #define USE_IX86_FRAME_POINTER 0
3456 #endif
3458 #ifndef USE_X86_64_FRAME_POINTER
3459 #define USE_X86_64_FRAME_POINTER 0
3460 #endif
3462 /* Set the default values for switches whose default depends on TARGET_64BIT
3463 in case they weren't overwritten by command line options. */
3464 if (TARGET_64BIT)
3466 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3467 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3468 if (flag_asynchronous_unwind_tables == 2)
3469 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3470 if (flag_pcc_struct_return == 2)
3471 flag_pcc_struct_return = 0;
3473 else
3475 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3476 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3477 if (flag_asynchronous_unwind_tables == 2)
3478 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3479 if (flag_pcc_struct_return == 2)
3480 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3483 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3484 if (optimize_size)
3485 ix86_cost = &ix86_size_cost;
3486 else
3487 ix86_cost = ix86_tune_cost;
3489 /* Arrange to set up i386_stack_locals for all functions. */
3490 init_machine_status = ix86_init_machine_status;
3492 /* Validate -mregparm= value. */
3493 if (global_options_set.x_ix86_regparm)
3495 if (TARGET_64BIT)
3496 warning (0, "-mregparm is ignored in 64-bit mode");
3497 if (ix86_regparm > REGPARM_MAX)
3499 error ("-mregparm=%d is not between 0 and %d",
3500 ix86_regparm, REGPARM_MAX);
3501 ix86_regparm = 0;
3504 if (TARGET_64BIT)
3505 ix86_regparm = REGPARM_MAX;
3507 /* Default align_* from the processor table. */
3508 if (align_loops == 0)
3510 align_loops = processor_target_table[ix86_tune].align_loop;
3511 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3513 if (align_jumps == 0)
3515 align_jumps = processor_target_table[ix86_tune].align_jump;
3516 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3518 if (align_functions == 0)
3520 align_functions = processor_target_table[ix86_tune].align_func;
3523 /* Provide default for -mbranch-cost= value. */
3524 if (!global_options_set.x_ix86_branch_cost)
3525 ix86_branch_cost = ix86_cost->branch_cost;
3527 if (TARGET_64BIT)
3529 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3531 /* Enable the SSE and MMX builtins by default. Do allow the user to
3532 explicitly disable any of these. In particular, disabling SSE and
3533 MMX for kernel code is extremely useful. */
3534 if (!ix86_arch_specified)
3535 ix86_isa_flags
3536 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3537 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3539 if (TARGET_RTD)
3540 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3542 else
3544 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3546 if (!ix86_arch_specified)
3547 ix86_isa_flags
3548 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3550 /* i386 ABI does not specify red zone. It still makes sense to use it
3551 when the programmer takes care to keep the stack from being destroyed. */
3552 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3553 target_flags |= MASK_NO_RED_ZONE;
3556 /* Keep nonleaf frame pointers. */
3557 if (flag_omit_frame_pointer)
3558 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3559 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3560 flag_omit_frame_pointer = 1;
3562 /* If we're doing fast math, we don't care about comparison order
3563 wrt NaNs. This lets us use a shorter comparison sequence. */
3564 if (flag_finite_math_only)
3565 target_flags &= ~MASK_IEEE_FP;
3567 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3568 since the insns won't need emulation. */
3569 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3570 target_flags &= ~MASK_NO_FANCY_MATH_387;
3572 /* Likewise, if the target doesn't have a 387, or we've specified
3573 software floating point, don't use 387 inline intrinsics. */
3574 if (!TARGET_80387)
3575 target_flags |= MASK_NO_FANCY_MATH_387;
3577 /* Turn on MMX builtins for -msse. */
3578 if (TARGET_SSE)
3579 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3581 /* Enable SSE prefetch. */
3582 if (TARGET_SSE || TARGET_PRFCHW)
3583 x86_prefetch_sse = true;
3585 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3586 if (TARGET_SSE4_2 || TARGET_ABM)
3587 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3589 /* Turn on lzcnt instruction for -mabm. */
3590 if (TARGET_ABM)
3591 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3593 /* Validate -mpreferred-stack-boundary= value or default it to
3594 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3595 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3596 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3598 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3599 int max = (TARGET_SEH ? 4 : 12);
3601 if (ix86_preferred_stack_boundary_arg < min
3602 || ix86_preferred_stack_boundary_arg > max)
3604 if (min == max)
3605 error ("-mpreferred-stack-boundary is not supported "
3606 "for this target");
3607 else
3608 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3609 ix86_preferred_stack_boundary_arg, min, max);
3611 else
3612 ix86_preferred_stack_boundary
3613 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
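/* The option argument is the log2 of the boundary in bytes, so for
   example -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
   = 16 * 8 = 128 bits, i.e. a 16-byte aligned stack.  A tiny standalone
   check of that arithmetic:  */
#if 0
#include <stdio.h>

int
main (void)
{
  const int bits_per_unit = 8;
  int arg = 4;   /* as in -mpreferred-stack-boundary=4 */
  printf ("%d bits (%d bytes)\n",
          (1 << arg) * bits_per_unit, 1 << arg);   /* 128 bits, 16 bytes */
  return 0;
}
#endif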
3616 /* Set the default value for -mstackrealign. */
3617 if (ix86_force_align_arg_pointer == -1)
3618 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3620 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3622 /* Validate -mincoming-stack-boundary= value or default it to
3623 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3624 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3625 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3627 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3628 || ix86_incoming_stack_boundary_arg > 12)
3629 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3630 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3631 else
3633 ix86_user_incoming_stack_boundary
3634 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3635 ix86_incoming_stack_boundary
3636 = ix86_user_incoming_stack_boundary;
3640 /* Accept -msseregparm only if at least SSE support is enabled. */
3641 if (TARGET_SSEREGPARM
3642 && ! TARGET_SSE)
3643 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3645 if (global_options_set.x_ix86_fpmath)
3647 if (ix86_fpmath & FPMATH_SSE)
3649 if (!TARGET_SSE)
3651 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3652 ix86_fpmath = FPMATH_387;
3654 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3656 warning (0, "387 instruction set disabled, using SSE arithmetics");
3657 ix86_fpmath = FPMATH_SSE;
3661 else
3662 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3664 /* If the i387 is disabled, then do not return values in it. */
3665 if (!TARGET_80387)
3666 target_flags &= ~MASK_FLOAT_RETURNS;
3668 /* Use external vectorized library in vectorizing intrinsics. */
3669 if (global_options_set.x_ix86_veclibabi_type)
3670 switch (ix86_veclibabi_type)
3672 case ix86_veclibabi_type_svml:
3673 ix86_veclib_handler = ix86_veclibabi_svml;
3674 break;
3676 case ix86_veclibabi_type_acml:
3677 ix86_veclib_handler = ix86_veclibabi_acml;
3678 break;
3680 default:
3681 gcc_unreachable ();
3684 if ((!USE_IX86_FRAME_POINTER
3685 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3686 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3687 && !optimize_size)
3688 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 /* ??? Unwind info is not correct around the CFG unless either a frame
3691 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3692 unwind info generation to be aware of the CFG and propagating states
3693 around edges. */
3694 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3695 || flag_exceptions || flag_non_call_exceptions)
3696 && flag_omit_frame_pointer
3697 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3699 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3700 warning (0, "unwind tables currently require either a frame pointer "
3701 "or %saccumulate-outgoing-args%s for correctness",
3702 prefix, suffix);
3703 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3706 /* If stack probes are required, the space used for large function
3707 arguments on the stack must also be probed, so enable
3708 -maccumulate-outgoing-args so this happens in the prologue. */
3709 if (TARGET_STACK_PROBE
3710 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3712 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3713 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3714 "for correctness", prefix, suffix);
3715 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3718 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3720 char *p;
3721 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3722 p = strchr (internal_label_prefix, 'X');
3723 internal_label_prefix_len = p - internal_label_prefix;
3724 *p = '\0';
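/* ASM_GENERATE_INTERNAL_LABEL fills the buffer with a local label built
   from the "LX" prefix and the number 0; truncating at the 'X' leaves
   just the target's local-label prefix and its length.  A standalone
   sketch of that string trick, assuming the buffer came out as ".LX0"
   (one plausible ELF spelling; the exact contents are target-specific):  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char label[16] = ".LX0";         /* assumed output of the macro */
  char *p = strchr (label, 'X');
  size_t prefix_len = p - label;   /* 2 */
  *p = '\0';                       /* label is now ".L" */
  printf ("prefix \"%s\", length %zu\n", label, prefix_len);
  return 0;
}
#endif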
3727 /* When the scheduling description is not available, disable the scheduler
3728 pass so that it won't slow down compilation and make x87 code slower. */
3729 if (!TARGET_SCHEDULE)
3730 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3732 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3733 ix86_tune_cost->simultaneous_prefetches,
3734 global_options.x_param_values,
3735 global_options_set.x_param_values);
3736 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3737 ix86_tune_cost->prefetch_block,
3738 global_options.x_param_values,
3739 global_options_set.x_param_values);
3740 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3741 ix86_tune_cost->l1_cache_size,
3742 global_options.x_param_values,
3743 global_options_set.x_param_values);
3744 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3745 ix86_tune_cost->l2_cache_size,
3746 global_options.x_param_values,
3747 global_options_set.x_param_values);
3749 /* Enable software prefetching at -O3 for CPUs where prefetching is beneficial. */
3750 if (flag_prefetch_loop_arrays < 0
3751 && HAVE_prefetch
3752 && (optimize >= 3 || flag_profile_use)
3753 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3754 flag_prefetch_loop_arrays = 1;
3756 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3757 can be optimized to ap = __builtin_next_arg (0). */
3758 if (!TARGET_64BIT && !flag_split_stack)
3759 targetm.expand_builtin_va_start = NULL;
3761 if (TARGET_64BIT)
3763 ix86_gen_leave = gen_leave_rex64;
3764 if (Pmode == DImode)
3766 ix86_gen_monitor = gen_sse3_monitor64_di;
3767 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3768 ix86_gen_tls_local_dynamic_base_64
3769 = gen_tls_local_dynamic_base_64_di;
3771 else
3773 ix86_gen_monitor = gen_sse3_monitor64_si;
3774 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3775 ix86_gen_tls_local_dynamic_base_64
3776 = gen_tls_local_dynamic_base_64_si;
3779 else
3781 ix86_gen_leave = gen_leave;
3782 ix86_gen_monitor = gen_sse3_monitor;
3785 if (Pmode == DImode)
3787 ix86_gen_add3 = gen_adddi3;
3788 ix86_gen_sub3 = gen_subdi3;
3789 ix86_gen_sub3_carry = gen_subdi3_carry;
3790 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3791 ix86_gen_andsp = gen_anddi3;
3792 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3793 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3794 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3796 else
3798 ix86_gen_add3 = gen_addsi3;
3799 ix86_gen_sub3 = gen_subsi3;
3800 ix86_gen_sub3_carry = gen_subsi3_carry;
3801 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3802 ix86_gen_andsp = gen_andsi3;
3803 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3804 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3805 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3808 #ifdef USE_IX86_CLD
3809 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3810 if (!TARGET_64BIT)
3811 target_flags |= MASK_CLD & ~target_flags_explicit;
3812 #endif
3814 if (!TARGET_64BIT && flag_pic)
3816 if (flag_fentry > 0)
3817 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3818 "with -fpic");
3819 flag_fentry = 0;
3821 else if (TARGET_SEH)
3823 if (flag_fentry == 0)
3824 sorry ("-mno-fentry isn%'t compatible with SEH");
3825 flag_fentry = 1;
3827 else if (flag_fentry < 0)
3829 #if defined(PROFILE_BEFORE_PROLOGUE)
3830 flag_fentry = 1;
3831 #else
3832 flag_fentry = 0;
3833 #endif
3836 if (TARGET_AVX)
3838 /* When not optimizing for size, enable vzeroupper optimization for
3839 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3840 AVX unaligned load/store. */
3841 if (!optimize_size)
3843 if (flag_expensive_optimizations
3844 && !(target_flags_explicit & MASK_VZEROUPPER))
3845 target_flags |= MASK_VZEROUPPER;
3846 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3847 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3848 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3849 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3850 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3851 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3852 /* Enable 128-bit AVX instruction generation
3853 for the auto-vectorizer. */
3854 if (TARGET_AVX128_OPTIMAL
3855 && !(target_flags_explicit & MASK_PREFER_AVX128))
3856 target_flags |= MASK_PREFER_AVX128;
3859 else
3861 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3862 target_flags &= ~MASK_VZEROUPPER;
3865 if (ix86_recip_name)
3867 char *p = ASTRDUP (ix86_recip_name);
3868 char *q;
3869 unsigned int mask, i;
3870 bool invert;
3872 while ((q = strtok (p, ",")) != NULL)
3874 p = NULL;
3875 if (*q == '!')
3877 invert = true;
3878 q++;
3880 else
3881 invert = false;
3883 if (!strcmp (q, "default"))
3884 mask = RECIP_MASK_ALL;
3885 else
3887 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3888 if (!strcmp (q, recip_options[i].string))
3890 mask = recip_options[i].mask;
3891 break;
3894 if (i == ARRAY_SIZE (recip_options))
3896 error ("unknown option for -mrecip=%s", q);
3897 invert = false;
3898 mask = RECIP_MASK_NONE;
3902 recip_mask_explicit |= mask;
3903 if (invert)
3904 recip_mask &= ~mask;
3905 else
3906 recip_mask |= mask;
3910 if (TARGET_RECIP)
3911 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3912 else if (target_flags_explicit & MASK_RECIP)
3913 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
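/* The loop above tokenizes the -mrecip= argument on commas, with a
   leading '!' negating a sub-option and "default" meaning all bits.  A
   self-contained sketch of the same strtok pattern, using made-up mask
   names rather than the RECIP_MASK_* values:  */
#if 0
#include <stdio.h>
#include <string.h>

#define EX_RECIP_DIV   0x1
#define EX_RECIP_SQRT  0x2
#define EX_RECIP_ALL   (EX_RECIP_DIV | EX_RECIP_SQRT)

int
main (void)
{
  char buf[] = "all,!div";          /* as in -mrecip=all,!div */
  unsigned mask = 0;
  char *p = buf, *q;

  while ((q = strtok (p, ",")) != NULL)
    {
      int invert = 0;
      unsigned bit;
      p = NULL;
      if (*q == '!')
        {
          invert = 1;
          q++;
        }
      if (!strcmp (q, "all"))
        bit = EX_RECIP_ALL;
      else if (!strcmp (q, "div"))
        bit = EX_RECIP_DIV;
      else if (!strcmp (q, "sqrt"))
        bit = EX_RECIP_SQRT;
      else
        bit = 0;                    /* the real code reports an error here */
      if (invert)
        mask &= ~bit;
      else
        mask |= bit;
    }

  printf ("mask = %#x\n", mask);    /* 0x2: square root only */
  return 0;
}
#endif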
3915 /* Default long double to 64-bit for Bionic. */
3916 if (TARGET_HAS_BIONIC
3917 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3918 target_flags |= MASK_LONG_DOUBLE_64;
3920 /* Save the initial options in case the user uses function-specific
3921 options. */
3922 if (main_args_p)
3923 target_option_default_node = target_option_current_node
3924 = build_target_option_node ();
3927 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3929 static void
3930 ix86_option_override (void)
3932 static struct register_pass_info insert_vzeroupper_info
3933 = { &pass_insert_vzeroupper.pass, "reload",
3934 1, PASS_POS_INSERT_AFTER
3937 ix86_option_override_internal (true);
3940 /* This needs to be done at start up. It's convenient to do it here. */
3941 register_pass (&insert_vzeroupper_info);
3944 /* Update register usage after having seen the compiler flags. */
3946 static void
3947 ix86_conditional_register_usage (void)
3949 int i, c_mask;
3950 unsigned int j;
3952 /* The PIC register, if it exists, is fixed. */
3953 j = PIC_OFFSET_TABLE_REGNUM;
3954 if (j != INVALID_REGNUM)
3955 fixed_regs[j] = call_used_regs[j] = 1;
3957 /* For 32-bit targets, squash the REX registers. */
3958 if (! TARGET_64BIT)
3960 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3961 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3962 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3963 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3966 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3967 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3968 : TARGET_64BIT ? (1 << 2)
3969 : (1 << 1));
3971 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3973 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 /* Set/reset conditionally defined registers from
3976 CALL_USED_REGISTERS initializer. */
3977 if (call_used_regs[i] > 1)
3978 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3980 /* Calculate registers of CLOBBERED_REGS register set
3981 as call used registers from GENERAL_REGS register set. */
3982 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3983 && call_used_regs[i])
3984 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3987 /* If MMX is disabled, squash the registers. */
3988 if (! TARGET_MMX)
3989 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3990 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3993 /* If SSE is disabled, squash the registers. */
3994 if (! TARGET_SSE)
3995 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3996 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3997 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3999 /* If the FPU is disabled, squash the registers. */
4000 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4001 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4002 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4003 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
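/* In CALL_USED_REGISTERS, conditionally call-used registers carry a
   value greater than 1 whose bits say "call-used under this ABI"; the
   c_mask computed above (bit 1 for 32-bit, bit 2 for 64-bit SYSV, bit 3
   for the 64-bit MS ABI) selects the relevant bit.  A standalone sketch
   with a made-up entry:  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* Hypothetical entry: call-used for the 32-bit ABI and the 64-bit MS
     ABI, but preserved under the 64-bit SYSV ABI.  */
  int entry = (1 << 1) | (1 << 3);
  int c_mask_32   = 1 << 1;
  int c_mask_sysv = 1 << 2;
  int c_mask_ms   = 1 << 3;

  printf ("32-bit %d, 64-bit SYSV %d, 64-bit MS %d\n",
          !!(entry & c_mask_32),
          !!(entry & c_mask_sysv),
          !!(entry & c_mask_ms));   /* 1, 0, 1 */
  return 0;
}
#endif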
4007 /* Save the current options */
4009 static void
4010 ix86_function_specific_save (struct cl_target_option *ptr)
4012 ptr->arch = ix86_arch;
4013 ptr->schedule = ix86_schedule;
4014 ptr->tune = ix86_tune;
4015 ptr->branch_cost = ix86_branch_cost;
4016 ptr->tune_defaulted = ix86_tune_defaulted;
4017 ptr->arch_specified = ix86_arch_specified;
4018 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4019 ptr->ix86_target_flags_explicit = target_flags_explicit;
4020 ptr->x_recip_mask_explicit = recip_mask_explicit;
4022 /* The fields are char but the variables are not; make sure the
4023 values fit in the fields. */
4024 gcc_assert (ptr->arch == ix86_arch);
4025 gcc_assert (ptr->schedule == ix86_schedule);
4026 gcc_assert (ptr->tune == ix86_tune);
4027 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4030 /* Restore the current options */
4032 static void
4033 ix86_function_specific_restore (struct cl_target_option *ptr)
4035 enum processor_type old_tune = ix86_tune;
4036 enum processor_type old_arch = ix86_arch;
4037 unsigned int ix86_arch_mask, ix86_tune_mask;
4038 int i;
4040 ix86_arch = (enum processor_type) ptr->arch;
4041 ix86_schedule = (enum attr_cpu) ptr->schedule;
4042 ix86_tune = (enum processor_type) ptr->tune;
4043 ix86_branch_cost = ptr->branch_cost;
4044 ix86_tune_defaulted = ptr->tune_defaulted;
4045 ix86_arch_specified = ptr->arch_specified;
4046 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4047 target_flags_explicit = ptr->ix86_target_flags_explicit;
4048 recip_mask_explicit = ptr->x_recip_mask_explicit;
4050 /* Recreate the arch feature tests if the arch changed */
4051 if (old_arch != ix86_arch)
4053 ix86_arch_mask = 1u << ix86_arch;
4054 for (i = 0; i < X86_ARCH_LAST; ++i)
4055 ix86_arch_features[i]
4056 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4059 /* Recreate the tune optimization tests */
4060 if (old_tune != ix86_tune)
4062 ix86_tune_mask = 1u << ix86_tune;
4063 for (i = 0; i < X86_TUNE_LAST; ++i)
4064 ix86_tune_features[i]
4065 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4069 /* Print the current options */
4071 static void
4072 ix86_function_specific_print (FILE *file, int indent,
4073 struct cl_target_option *ptr)
4075 char *target_string
4076 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4077 NULL, NULL, ptr->x_ix86_fpmath, false);
4079 fprintf (file, "%*sarch = %d (%s)\n",
4080 indent, "",
4081 ptr->arch,
4082 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4083 ? cpu_names[ptr->arch]
4084 : "<unknown>"));
4086 fprintf (file, "%*stune = %d (%s)\n",
4087 indent, "",
4088 ptr->tune,
4089 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4090 ? cpu_names[ptr->tune]
4091 : "<unknown>"));
4093 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4095 if (target_string)
4097 fprintf (file, "%*s%s\n", indent, "", target_string);
4098 free (target_string);
4103 /* Inner function to process the attribute((target(...))), take an argument and
4104 set the current options from the argument. If we have a list, recursively go
4105 over the list. */
4107 static bool
4108 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4109 struct gcc_options *enum_opts_set)
4111 char *next_optstr;
4112 bool ret = true;
4114 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4115 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4116 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4117 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4118 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4120 enum ix86_opt_type
4122 ix86_opt_unknown,
4123 ix86_opt_yes,
4124 ix86_opt_no,
4125 ix86_opt_str,
4126 ix86_opt_enum,
4127 ix86_opt_isa
4130 static const struct
4132 const char *string;
4133 size_t len;
4134 enum ix86_opt_type type;
4135 int opt;
4136 int mask;
4137 } attrs[] = {
4138 /* isa options */
4139 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4140 IX86_ATTR_ISA ("abm", OPT_mabm),
4141 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4142 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4143 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4144 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4145 IX86_ATTR_ISA ("aes", OPT_maes),
4146 IX86_ATTR_ISA ("avx", OPT_mavx),
4147 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4148 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4149 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4150 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4151 IX86_ATTR_ISA ("sse", OPT_msse),
4152 IX86_ATTR_ISA ("sse2", OPT_msse2),
4153 IX86_ATTR_ISA ("sse3", OPT_msse3),
4154 IX86_ATTR_ISA ("sse4", OPT_msse4),
4155 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4156 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4157 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4158 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4159 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4160 IX86_ATTR_ISA ("fma", OPT_mfma),
4161 IX86_ATTR_ISA ("xop", OPT_mxop),
4162 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4163 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4164 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4165 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4166 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4167 IX86_ATTR_ISA ("hle", OPT_mhle),
4168 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4169 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4170 IX86_ATTR_ISA ("adx", OPT_madx),
4171 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4172 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4173 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4175 /* enum options */
4176 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4178 /* string options */
4179 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4180 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4182 /* flag options */
4183 IX86_ATTR_YES ("cld",
4184 OPT_mcld,
4185 MASK_CLD),
4187 IX86_ATTR_NO ("fancy-math-387",
4188 OPT_mfancy_math_387,
4189 MASK_NO_FANCY_MATH_387),
4191 IX86_ATTR_YES ("ieee-fp",
4192 OPT_mieee_fp,
4193 MASK_IEEE_FP),
4195 IX86_ATTR_YES ("inline-all-stringops",
4196 OPT_minline_all_stringops,
4197 MASK_INLINE_ALL_STRINGOPS),
4199 IX86_ATTR_YES ("inline-stringops-dynamically",
4200 OPT_minline_stringops_dynamically,
4201 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4203 IX86_ATTR_NO ("align-stringops",
4204 OPT_mno_align_stringops,
4205 MASK_NO_ALIGN_STRINGOPS),
4207 IX86_ATTR_YES ("recip",
4208 OPT_mrecip,
4209 MASK_RECIP),
4213 /* If this is a list, recurse to get the options. */
4214 if (TREE_CODE (args) == TREE_LIST)
4216 bool ret = true;
4218 for (; args; args = TREE_CHAIN (args))
4219 if (TREE_VALUE (args)
4220 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4221 p_strings, enum_opts_set))
4222 ret = false;
4224 return ret;
4227 else if (TREE_CODE (args) != STRING_CST)
4229 error ("attribute %<target%> argument not a string");
4230 return false;
4233 /* Handle multiple arguments separated by commas. */
4234 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4236 while (next_optstr && *next_optstr != '\0')
4238 char *p = next_optstr;
4239 char *orig_p = p;
4240 char *comma = strchr (next_optstr, ',');
4241 const char *opt_string;
4242 size_t len, opt_len;
4243 int opt;
4244 bool opt_set_p;
4245 char ch;
4246 unsigned i;
4247 enum ix86_opt_type type = ix86_opt_unknown;
4248 int mask = 0;
4250 if (comma)
4252 *comma = '\0';
4253 len = comma - next_optstr;
4254 next_optstr = comma + 1;
4256 else
4258 len = strlen (p);
4259 next_optstr = NULL;
4262 /* Recognize no-xxx. */
4263 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4265 opt_set_p = false;
4266 p += 3;
4267 len -= 3;
4269 else
4270 opt_set_p = true;
4272 /* Find the option. */
4273 ch = *p;
4274 opt = N_OPTS;
4275 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4277 type = attrs[i].type;
4278 opt_len = attrs[i].len;
4279 if (ch == attrs[i].string[0]
4280 && ((type != ix86_opt_str && type != ix86_opt_enum)
4281 ? len == opt_len
4282 : len > opt_len)
4283 && memcmp (p, attrs[i].string, opt_len) == 0)
4285 opt = attrs[i].opt;
4286 mask = attrs[i].mask;
4287 opt_string = attrs[i].string;
4288 break;
4292 /* Process the option. */
4293 if (opt == N_OPTS)
4295 error ("attribute(target(\"%s\")) is unknown", orig_p);
4296 ret = false;
4299 else if (type == ix86_opt_isa)
4301 struct cl_decoded_option decoded;
4303 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4304 ix86_handle_option (&global_options, &global_options_set,
4305 &decoded, input_location);
4308 else if (type == ix86_opt_yes || type == ix86_opt_no)
4310 if (type == ix86_opt_no)
4311 opt_set_p = !opt_set_p;
4313 if (opt_set_p)
4314 target_flags |= mask;
4315 else
4316 target_flags &= ~mask;
4319 else if (type == ix86_opt_str)
4321 if (p_strings[opt])
4323 error ("option(\"%s\") was already specified", opt_string);
4324 ret = false;
4326 else
4327 p_strings[opt] = xstrdup (p + opt_len);
4330 else if (type == ix86_opt_enum)
4332 bool arg_ok;
4333 int value;
4335 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4336 if (arg_ok)
4337 set_option (&global_options, enum_opts_set, opt, value,
4338 p + opt_len, DK_UNSPECIFIED, input_location,
4339 global_dc);
4340 else
4342 error ("attribute(target(\"%s\")) is unknown", orig_p);
4343 ret = false;
4347 else
4348 gcc_unreachable ();
4351 return ret;
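/* The strings recognized above are what users write inside
   __attribute__((target("..."))); "arch=" and "tune=" take a value,
   ISA names may carry a "no-" prefix, and several options can be
   comma-separated.  A user-level example (not part of this file) of a
   string the parser above would accept:  */
#if 0
__attribute__((target("arch=core-avx2,no-rtm")))
static int
example_sum (const int *a, int n)
{
  int i, s = 0;
  for (i = 0; i < n; i++)
    s += a[i];
  return s;
}
#endif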
4354 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4356 tree
4357 ix86_valid_target_attribute_tree (tree args)
4359 const char *orig_arch_string = ix86_arch_string;
4360 const char *orig_tune_string = ix86_tune_string;
4361 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4362 int orig_tune_defaulted = ix86_tune_defaulted;
4363 int orig_arch_specified = ix86_arch_specified;
4364 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4365 tree t = NULL_TREE;
4366 int i;
4367 struct cl_target_option *def
4368 = TREE_TARGET_OPTION (target_option_default_node);
4369 struct gcc_options enum_opts_set;
4371 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4373 /* Process each of the options on the chain. */
4374 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4375 &enum_opts_set))
4376 return error_mark_node;
4378 /* If the changed options are different from the default, rerun
4379 ix86_option_override_internal, and then save the options away.
4380 The string options are attribute options, and will be undone
4381 when we copy the save structure. */
4382 if (ix86_isa_flags != def->x_ix86_isa_flags
4383 || target_flags != def->x_target_flags
4384 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4385 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4386 || enum_opts_set.x_ix86_fpmath)
4388 /* If we are using the default tune= or arch=, undo the string assigned,
4389 and use the default. */
4390 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4391 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4392 else if (!orig_arch_specified)
4393 ix86_arch_string = NULL;
4395 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4396 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4397 else if (orig_tune_defaulted)
4398 ix86_tune_string = NULL;
4400 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4401 if (enum_opts_set.x_ix86_fpmath)
4402 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4403 else if (!TARGET_64BIT && TARGET_SSE)
4405 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4406 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4409 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4410 ix86_option_override_internal (false);
4412 /* Add any builtin functions with the new isa if any. */
4413 ix86_add_new_builtins (ix86_isa_flags);
4415 /* Save the current options unless we are validating options for
4416 #pragma. */
4417 t = build_target_option_node ();
4419 ix86_arch_string = orig_arch_string;
4420 ix86_tune_string = orig_tune_string;
4421 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4423 /* Free up memory allocated to hold the strings */
4424 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4425 free (option_strings[i]);
4428 return t;
4431 /* Hook to validate attribute((target("string"))). */
4433 static bool
4434 ix86_valid_target_attribute_p (tree fndecl,
4435 tree ARG_UNUSED (name),
4436 tree args,
4437 int ARG_UNUSED (flags))
4439 struct cl_target_option cur_target;
4440 bool ret = true;
4442 /* attribute((target("default"))) does nothing, beyond
4443 affecting multi-versioning. */
4444 if (TREE_VALUE (args)
4445 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4446 && TREE_CHAIN (args) == NULL_TREE
4447 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4448 return true;
4450 tree old_optimize = build_optimization_node ();
4451 tree new_target, new_optimize;
4452 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4454 /* If the function changed the optimization levels as well as setting target
4455 options, start with the optimizations specified. */
4456 if (func_optimize && func_optimize != old_optimize)
4457 cl_optimization_restore (&global_options,
4458 TREE_OPTIMIZATION (func_optimize));
4460 /* The target attributes may also change some optimization flags, so update
4461 the optimization options if necessary. */
4462 cl_target_option_save (&cur_target, &global_options);
4463 new_target = ix86_valid_target_attribute_tree (args);
4464 new_optimize = build_optimization_node ();
4466 if (new_target == error_mark_node)
4467 ret = false;
4469 else if (fndecl && new_target)
4471 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4473 if (old_optimize != new_optimize)
4474 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4477 cl_target_option_restore (&global_options, &cur_target);
4479 if (old_optimize != new_optimize)
4480 cl_optimization_restore (&global_options,
4481 TREE_OPTIMIZATION (old_optimize));
4483 return ret;
4487 /* Hook to determine if one function can safely inline another. */
4489 static bool
4490 ix86_can_inline_p (tree caller, tree callee)
4492 bool ret = false;
4493 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4494 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4496 /* If callee has no option attributes, then it is ok to inline. */
4497 if (!callee_tree)
4498 ret = true;
4500 /* If caller has no option attributes, but callee does then it is not ok to
4501 inline. */
4502 else if (!caller_tree)
4503 ret = false;
4505 else
4507 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4508 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4510 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4511 function can inline an SSE2 function but an SSE2 function can't inline
4512 an SSE4 function. */
4513 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4514 != callee_opts->x_ix86_isa_flags)
4515 ret = false;
4517 /* See if we have the same non-isa options. */
4518 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4519 ret = false;
4521 /* See if arch, tune, etc. are the same. */
4522 else if (caller_opts->arch != callee_opts->arch)
4523 ret = false;
4525 else if (caller_opts->tune != callee_opts->tune)
4526 ret = false;
4528 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4529 ret = false;
4531 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4532 ret = false;
4534 else
4535 ret = true;
4538 return ret;
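/* The ISA check above requires the callee's ISA bits to be a subset of
   the caller's: (caller & callee) == callee.  A worked standalone
   example with made-up masks:  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned caller = 0x1 | 0x4;   /* stand-ins for, say, SSE2 | SSE4.2 */
  unsigned callee = 0x1;         /* stand-in for SSE2                 */

  /* Callee into caller: every callee bit is also set in the caller.  */
  printf ("%s\n", (caller & callee) == callee ? "ok" : "no");   /* ok */
  /* Caller into callee: the caller uses bits the callee lacks.  */
  printf ("%s\n", (callee & caller) == caller ? "ok" : "no");   /* no */
  return 0;
}
#endif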
4542 /* Remember the last target of ix86_set_current_function. */
4543 static GTY(()) tree ix86_previous_fndecl;
4545 /* Establish appropriate back-end context for processing the function
4546 FNDECL. The argument might be NULL to indicate processing at top
4547 level, outside of any function scope. */
4548 static void
4549 ix86_set_current_function (tree fndecl)
4551 /* Only change the context if the function changes. This hook is called
4552 several times in the course of compiling a function, and we don't want to
4553 slow things down too much or call target_reinit when it isn't safe. */
4554 if (fndecl && fndecl != ix86_previous_fndecl)
4556 tree old_tree = (ix86_previous_fndecl
4557 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4558 : NULL_TREE);
4560 tree new_tree = (fndecl
4561 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4562 : NULL_TREE);
4564 ix86_previous_fndecl = fndecl;
4565 if (old_tree == new_tree)
4568 else if (new_tree)
4570 cl_target_option_restore (&global_options,
4571 TREE_TARGET_OPTION (new_tree));
4572 target_reinit ();
4575 else if (old_tree)
4577 struct cl_target_option *def
4578 = TREE_TARGET_OPTION (target_option_current_node);
4580 cl_target_option_restore (&global_options, def);
4581 target_reinit ();
4587 /* Return true if this goes in large data/bss. */
4589 static bool
4590 ix86_in_large_data_p (tree exp)
4592 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4593 return false;
4595 /* Functions are never large data. */
4596 if (TREE_CODE (exp) == FUNCTION_DECL)
4597 return false;
4599 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4601 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4602 if (strcmp (section, ".ldata") == 0
4603 || strcmp (section, ".lbss") == 0)
4604 return true;
4605 return false;
4607 else
4609 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4611 /* If this is an incomplete type with size 0, then we can't put it
4612 in data because it might be too big when completed. */
4613 if (!size || size > ix86_section_threshold)
4614 return true;
4617 return false;
4620 /* Switch to the appropriate section for output of DECL.
4621 DECL is either a `VAR_DECL' node or a constant of some sort.
4622 RELOC indicates whether forming the initial value of DECL requires
4623 link-time relocations. */
4625 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4626 ATTRIBUTE_UNUSED;
4628 static section *
4629 x86_64_elf_select_section (tree decl, int reloc,
4630 unsigned HOST_WIDE_INT align)
4632 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4633 && ix86_in_large_data_p (decl))
4635 const char *sname = NULL;
4636 unsigned int flags = SECTION_WRITE;
4637 switch (categorize_decl_for_section (decl, reloc))
4639 case SECCAT_DATA:
4640 sname = ".ldata";
4641 break;
4642 case SECCAT_DATA_REL:
4643 sname = ".ldata.rel";
4644 break;
4645 case SECCAT_DATA_REL_LOCAL:
4646 sname = ".ldata.rel.local";
4647 break;
4648 case SECCAT_DATA_REL_RO:
4649 sname = ".ldata.rel.ro";
4650 break;
4651 case SECCAT_DATA_REL_RO_LOCAL:
4652 sname = ".ldata.rel.ro.local";
4653 break;
4654 case SECCAT_BSS:
4655 sname = ".lbss";
4656 flags |= SECTION_BSS;
4657 break;
4658 case SECCAT_RODATA:
4659 case SECCAT_RODATA_MERGE_STR:
4660 case SECCAT_RODATA_MERGE_STR_INIT:
4661 case SECCAT_RODATA_MERGE_CONST:
4662 sname = ".lrodata";
4663 flags = 0;
4664 break;
4665 case SECCAT_SRODATA:
4666 case SECCAT_SDATA:
4667 case SECCAT_SBSS:
4668 gcc_unreachable ();
4669 case SECCAT_TEXT:
4670 case SECCAT_TDATA:
4671 case SECCAT_TBSS:
4672 /* We don't split these for the medium model. Place them into
4673 default sections and hope for the best. */
4674 break;
4676 if (sname)
4678 /* We might get called with string constants, but get_named_section
4679 doesn't like them as they are not DECLs. Also, we need to set
4680 flags in that case. */
4681 if (!DECL_P (decl))
4682 return get_section (sname, flags, NULL);
4683 return get_named_section (decl, sname, reloc);
4686 return default_elf_select_section (decl, reloc, align);
4689 /* Build up a unique section name, expressed as a
4690 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4691 RELOC indicates whether the initial value of EXP requires
4692 link-time relocations. */
4694 static void ATTRIBUTE_UNUSED
4695 x86_64_elf_unique_section (tree decl, int reloc)
4697 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4698 && ix86_in_large_data_p (decl))
4700 const char *prefix = NULL;
4701 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4702 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4704 switch (categorize_decl_for_section (decl, reloc))
4706 case SECCAT_DATA:
4707 case SECCAT_DATA_REL:
4708 case SECCAT_DATA_REL_LOCAL:
4709 case SECCAT_DATA_REL_RO:
4710 case SECCAT_DATA_REL_RO_LOCAL:
4711 prefix = one_only ? ".ld" : ".ldata";
4712 break;
4713 case SECCAT_BSS:
4714 prefix = one_only ? ".lb" : ".lbss";
4715 break;
4716 case SECCAT_RODATA:
4717 case SECCAT_RODATA_MERGE_STR:
4718 case SECCAT_RODATA_MERGE_STR_INIT:
4719 case SECCAT_RODATA_MERGE_CONST:
4720 prefix = one_only ? ".lr" : ".lrodata";
4721 break;
4722 case SECCAT_SRODATA:
4723 case SECCAT_SDATA:
4724 case SECCAT_SBSS:
4725 gcc_unreachable ();
4726 case SECCAT_TEXT:
4727 case SECCAT_TDATA:
4728 case SECCAT_TBSS:
4729 /* We don't split these for the medium model. Place them into
4730 default sections and hope for the best. */
4731 break;
4733 if (prefix)
4735 const char *name, *linkonce;
4736 char *string;
4738 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4739 name = targetm.strip_name_encoding (name);
4741 /* If we're using one_only, then there needs to be a .gnu.linkonce
4742 prefix to the section name. */
4743 linkonce = one_only ? ".gnu.linkonce" : "";
4745 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4747 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4748 return;
4751 default_unique_section (decl, reloc);
4754 #ifdef COMMON_ASM_OP
4755 /* This says how to output assembler code to declare an
4756 uninitialized external linkage data object.
4758 For medium model x86-64 we need to use .largecomm opcode for
4759 large objects. */
4760 void
4761 x86_elf_aligned_common (FILE *file,
4762 const char *name, unsigned HOST_WIDE_INT size,
4763 int align)
4765 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4766 && size > (unsigned int)ix86_section_threshold)
4767 fputs (".largecomm\t", file);
4768 else
4769 fputs (COMMON_ASM_OP, file);
4770 assemble_name (file, name);
4771 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4772 size, align / BITS_PER_UNIT);
4774 #endif
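/* For illustration (names and sizes made up): under -mcmodel=medium an
   uninitialized common object whose size exceeds ix86_section_threshold
   would come out roughly as

       .largecomm   buf,400000,32

   while a small object gets the usual form produced by COMMON_ASM_OP
   (typically ".comm" on ELF), e.g.

       .comm        buf,64,16

   where the last field is the alignment in bytes (align / BITS_PER_UNIT).  */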
4776 /* Utility function for targets to use in implementing
4777 ASM_OUTPUT_ALIGNED_BSS. */
4779 void
4780 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4781 const char *name, unsigned HOST_WIDE_INT size,
4782 int align)
4784 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4785 && size > (unsigned int)ix86_section_threshold)
4786 switch_to_section (get_named_section (decl, ".lbss", 0));
4787 else
4788 switch_to_section (bss_section);
4789 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4790 #ifdef ASM_DECLARE_OBJECT_NAME
4791 last_assemble_variable_decl = decl;
4792 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4793 #else
4794 /* Standard thing is just output label for the object. */
4795 ASM_OUTPUT_LABEL (file, name);
4796 #endif /* ASM_DECLARE_OBJECT_NAME */
4797 ASM_OUTPUT_SKIP (file, size ? size : 1);
4800 /* Decide whether we must probe the stack before any space allocation
4801 on this target. It's essentially TARGET_STACK_PROBE except when
4802 -fstack-check causes the stack to be already probed differently. */
4804 bool
4805 ix86_target_stack_probe (void)
4807 /* Do not probe the stack twice if static stack checking is enabled. */
4808 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4809 return false;
4811 return TARGET_STACK_PROBE;
4814 /* Decide whether we can make a sibling call to a function. DECL is the
4815 declaration of the function being targeted by the call and EXP is the
4816 CALL_EXPR representing the call. */
4818 static bool
4819 ix86_function_ok_for_sibcall (tree decl, tree exp)
4821 tree type, decl_or_type;
4822 rtx a, b;
4824 /* If we are generating position-independent code, we cannot sibcall
4825 optimize any indirect call, or a direct call to a global function,
4826 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4827 if (!TARGET_MACHO
4828 && !TARGET_64BIT
4829 && flag_pic
4830 && (!decl || !targetm.binds_local_p (decl)))
4831 return false;
4833 /* If we need to align the outgoing stack, then sibcalling would
4834 unalign the stack, which may break the called function. */
4835 if (ix86_minimum_incoming_stack_boundary (true)
4836 < PREFERRED_STACK_BOUNDARY)
4837 return false;
4839 if (decl)
4841 decl_or_type = decl;
4842 type = TREE_TYPE (decl);
4844 else
4846 /* We're looking at the CALL_EXPR, we need the type of the function. */
4847 type = CALL_EXPR_FN (exp); /* pointer expression */
4848 type = TREE_TYPE (type); /* pointer type */
4849 type = TREE_TYPE (type); /* function type */
4850 decl_or_type = type;
4853 /* Check that the return value locations are the same. Like
4854 if we are returning floats on the 80387 register stack, we cannot
4855 make a sibcall from a function that doesn't return a float to a
4856 function that does or, conversely, from a function that does return
4857 a float to a function that doesn't; the necessary stack adjustment
4858 would not be executed. This is also the place we notice
4859 differences in the return value ABI. Note that it is ok for one
4860 of the functions to have void return type as long as the return
4861 value of the other is passed in a register. */
4862 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4863 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4864 cfun->decl, false);
4865 if (STACK_REG_P (a) || STACK_REG_P (b))
4867 if (!rtx_equal_p (a, b))
4868 return false;
4870 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4872 else if (!rtx_equal_p (a, b))
4873 return false;
4875 if (TARGET_64BIT)
4877 /* The SYSV ABI has more call-clobbered registers;
4878 disallow sibcalls from MS to SYSV. */
4879 if (cfun->machine->call_abi == MS_ABI
4880 && ix86_function_type_abi (type) == SYSV_ABI)
4881 return false;
4883 else
4885 /* If this call is indirect, we'll need to be able to use a
4886 call-clobbered register for the address of the target function.
4887 Make sure that all such registers are not used for passing
4888 parameters. Note that DLLIMPORT functions are indirect. */
4889 if (!decl
4890 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4892 if (ix86_function_regparm (type, NULL) >= 3)
4894 /* ??? Need to count the actual number of registers to be used,
4895 not the possible number of registers. Fix later. */
4896 return false;
4901 /* Otherwise okay. That also includes certain types of indirect calls. */
4902 return true;
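/* Illustrative example (assumption, not from the original source):

     extern int g (int);
     int f (int x) { return g (x); }

   is a tail-call candidate, but when compiled with -O2 -m32 -fpic the
   check above rejects the sibcall because G is global and not known to
   bind locally, so an ordinary call through the PLT (which needs %ebx
   live) is emitted instead.  With -O2 -m64 the sibcall is normally
   allowed.  */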
4905 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4906 and "sseregparm" calling convention attributes;
4907 arguments as in struct attribute_spec.handler. */
4909 static tree
4910 ix86_handle_cconv_attribute (tree *node, tree name,
4911 tree args,
4912 int flags ATTRIBUTE_UNUSED,
4913 bool *no_add_attrs)
4915 if (TREE_CODE (*node) != FUNCTION_TYPE
4916 && TREE_CODE (*node) != METHOD_TYPE
4917 && TREE_CODE (*node) != FIELD_DECL
4918 && TREE_CODE (*node) != TYPE_DECL)
4920 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4921 name);
4922 *no_add_attrs = true;
4923 return NULL_TREE;
4926 /* Can combine regparm with all attributes but fastcall, and thiscall. */
4927 if (is_attribute_p ("regparm", name))
4929 tree cst;
4931 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4933 error ("fastcall and regparm attributes are not compatible");
4936 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4938 error ("regparam and thiscall attributes are not compatible");
4941 cst = TREE_VALUE (args);
4942 if (TREE_CODE (cst) != INTEGER_CST)
4944 warning (OPT_Wattributes,
4945 "%qE attribute requires an integer constant argument",
4946 name);
4947 *no_add_attrs = true;
4949 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4951 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4952 name, REGPARM_MAX);
4953 *no_add_attrs = true;
4956 return NULL_TREE;
4959 if (TARGET_64BIT)
4961 /* Do not warn when emulating the MS ABI. */
4962 if ((TREE_CODE (*node) != FUNCTION_TYPE
4963 && TREE_CODE (*node) != METHOD_TYPE)
4964 || ix86_function_type_abi (*node) != MS_ABI)
4965 warning (OPT_Wattributes, "%qE attribute ignored",
4966 name);
4967 *no_add_attrs = true;
4968 return NULL_TREE;
4971 /* Can combine fastcall only with sseregparm; cdecl, stdcall, regparm
and thiscall conflict with it. */
4972 if (is_attribute_p ("fastcall", name))
4974 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4976 error ("fastcall and cdecl attributes are not compatible");
4978 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4980 error ("fastcall and stdcall attributes are not compatible");
4982 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4984 error ("fastcall and regparm attributes are not compatible");
4986 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4988 error ("fastcall and thiscall attributes are not compatible");
4992 /* Can combine stdcall with regparm and sseregparm; cdecl, fastcall
4993 and thiscall conflict with it. */
4994 else if (is_attribute_p ("stdcall", name))
4996 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4998 error ("stdcall and cdecl attributes are not compatible");
5000 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5002 error ("stdcall and fastcall attributes are not compatible");
5004 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5006 error ("stdcall and thiscall attributes are not compatible");
5010 /* Can combine cdecl with regparm and sseregparm. */
5011 else if (is_attribute_p ("cdecl", name))
5013 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5015 error ("stdcall and cdecl attributes are not compatible");
5017 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 error ("fastcall and cdecl attributes are not compatible");
5021 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5023 error ("cdecl and thiscall attributes are not compatible");
5026 else if (is_attribute_p ("thiscall", name))
5028 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5029 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5030 name);
5031 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5033 error ("stdcall and thiscall attributes are not compatible");
5035 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5037 error ("fastcall and thiscall attributes are not compatible");
5039 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5041 error ("cdecl and thiscall attributes are not compatible");
5045 /* Can combine sseregparm with all attributes. */
5047 return NULL_TREE;
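/* Illustrative example (assumption, not from the original source): the
   handler above accepts

     int __attribute__((regparm(3))) f (int, int, int);

   but rejects conflicting combinations, e.g.

     void __attribute__((fastcall, stdcall)) g (int);

   with "fastcall and stdcall attributes are not compatible", and warns
   if regparm's argument exceeds REGPARM_MAX.  */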
5050 /* The transactional memory builtins are implicitly regparm or fastcall
5051 depending on the ABI. Override the generic do-nothing attribute that
5052 these builtins were declared with, and replace it with one of the two
5053 attributes that we expect elsewhere. */
5055 static tree
5056 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5057 tree args ATTRIBUTE_UNUSED,
5058 int flags ATTRIBUTE_UNUSED,
5059 bool *no_add_attrs)
5061 tree alt;
5063 /* In no case do we want to add the placeholder attribute. */
5064 *no_add_attrs = true;
5066 /* The 64-bit ABI is unchanged for transactional memory. */
5067 if (TARGET_64BIT)
5068 return NULL_TREE;
5070 /* ??? Is there a better way to validate 32-bit windows? We have
5071 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5072 if (CHECK_STACK_LIMIT > 0)
5073 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5074 else
5076 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5077 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5079 decl_attributes (node, alt, flags);
5081 return NULL_TREE;
5084 /* This function determines from TYPE the calling-convention. */
5086 unsigned int
5087 ix86_get_callcvt (const_tree type)
5089 unsigned int ret = 0;
5090 bool is_stdarg;
5091 tree attrs;
5093 if (TARGET_64BIT)
5094 return IX86_CALLCVT_CDECL;
5096 attrs = TYPE_ATTRIBUTES (type);
5097 if (attrs != NULL_TREE)
5099 if (lookup_attribute ("cdecl", attrs))
5100 ret |= IX86_CALLCVT_CDECL;
5101 else if (lookup_attribute ("stdcall", attrs))
5102 ret |= IX86_CALLCVT_STDCALL;
5103 else if (lookup_attribute ("fastcall", attrs))
5104 ret |= IX86_CALLCVT_FASTCALL;
5105 else if (lookup_attribute ("thiscall", attrs))
5106 ret |= IX86_CALLCVT_THISCALL;
5108 /* Regparm isn't allowed for thiscall and fastcall. */
5109 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5111 if (lookup_attribute ("regparm", attrs))
5112 ret |= IX86_CALLCVT_REGPARM;
5113 if (lookup_attribute ("sseregparm", attrs))
5114 ret |= IX86_CALLCVT_SSEREGPARM;
5117 if (IX86_BASE_CALLCVT(ret) != 0)
5118 return ret;
5121 is_stdarg = stdarg_p (type);
5122 if (TARGET_RTD && !is_stdarg)
5123 return IX86_CALLCVT_STDCALL | ret;
5125 if (ret != 0
5126 || is_stdarg
5127 || TREE_CODE (type) != METHOD_TYPE
5128 || ix86_function_type_abi (type) != MS_ABI)
5129 return IX86_CALLCVT_CDECL | ret;
5131 return IX86_CALLCVT_THISCALL;
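/* Illustrative example (assumption, not from the original source): for
   32-bit code,

     void __attribute__((fastcall)) f (int);

   ix86_get_callcvt on TREE_TYPE (f) is expected to yield
   IX86_CALLCVT_FASTCALL, while an unadorned non-variadic prototype
   yields IX86_CALLCVT_CDECL by default, or IX86_CALLCVT_STDCALL when
   -mrtd (TARGET_RTD) is in effect.  */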
5134 /* Return 0 if the attributes for two types are incompatible, 1 if they
5135 are compatible, and 2 if they are nearly compatible (which causes a
5136 warning to be generated). */
5138 static int
5139 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5141 unsigned int ccvt1, ccvt2;
5143 if (TREE_CODE (type1) != FUNCTION_TYPE
5144 && TREE_CODE (type1) != METHOD_TYPE)
5145 return 1;
5147 ccvt1 = ix86_get_callcvt (type1);
5148 ccvt2 = ix86_get_callcvt (type2);
5149 if (ccvt1 != ccvt2)
5150 return 0;
5151 if (ix86_function_regparm (type1, NULL)
5152 != ix86_function_regparm (type2, NULL))
5153 return 0;
5155 return 1;
5158 /* Return the regparm value for a function with the indicated TYPE and DECL.
5159 DECL may be NULL when calling function indirectly
5160 or considering a libcall. */
5162 static int
5163 ix86_function_regparm (const_tree type, const_tree decl)
5165 tree attr;
5166 int regparm;
5167 unsigned int ccvt;
5169 if (TARGET_64BIT)
5170 return (ix86_function_type_abi (type) == SYSV_ABI
5171 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5172 ccvt = ix86_get_callcvt (type);
5173 regparm = ix86_regparm;
5175 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5177 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5178 if (attr)
5180 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5181 return regparm;
5184 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5185 return 2;
5186 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5187 return 1;
5189 /* Use register calling convention for local functions when possible. */
5190 if (decl
5191 && TREE_CODE (decl) == FUNCTION_DECL
5192 && optimize
5193 && !(profile_flag && !flag_fentry))
5195 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5196 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5197 if (i && i->local && i->can_change_signature)
5199 int local_regparm, globals = 0, regno;
5201 /* Make sure no regparm register is taken by a
5202 fixed register variable. */
5203 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5204 if (fixed_regs[local_regparm])
5205 break;
5207 /* We don't want to use regparm(3) for nested functions as
5208 these use a static chain pointer in the third argument. */
5209 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5210 local_regparm = 2;
5212 /* In 32-bit mode save a register for the split stack. */
5213 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5214 local_regparm = 2;
5216 /* Each fixed register usage increases register pressure,
5217 so fewer registers should be used for argument passing.
5218 This functionality can be overridden by an explicit
5219 regparm value. */
5220 for (regno = AX_REG; regno <= DI_REG; regno++)
5221 if (fixed_regs[regno])
5222 globals++;
5224 local_regparm
5225 = globals < local_regparm ? local_regparm - globals : 0;
5227 if (local_regparm > regparm)
5228 regparm = local_regparm;
5232 return regparm;
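/* Illustrative example (assumption, not from the original source): a
   purely local helper such as

     static int __attribute__((noinline)) add3 (int a, int b, int c)
     { return a + b + c; }

   compiled with -O2 -m32 is expected to be promoted to regparm(3) by the
   heuristic above once cgraph proves it local, so A, B and C arrive in
   %eax, %edx and %ecx; a nested or split-stack function is capped at
   regparm(2) as coded above.  */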
5235 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5236 DFmode (2) arguments in SSE registers for a function with the
5237 indicated TYPE and DECL. DECL may be NULL when calling function
5238 indirectly or considering a libcall. Otherwise return 0. */
5240 static int
5241 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5243 gcc_assert (!TARGET_64BIT);
5245 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5246 by the sseregparm attribute. */
5247 if (TARGET_SSEREGPARM
5248 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5250 if (!TARGET_SSE)
5252 if (warn)
5254 if (decl)
5255 error ("calling %qD with attribute sseregparm without "
5256 "SSE/SSE2 enabled", decl);
5257 else
5258 error ("calling %qT with attribute sseregparm without "
5259 "SSE/SSE2 enabled", type);
5261 return 0;
5264 return 2;
5267 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5268 (and DFmode for SSE2) arguments in SSE registers. */
5269 if (decl && TARGET_SSE_MATH && optimize
5270 && !(profile_flag && !flag_fentry))
5272 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5273 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5274 if (i && i->local && i->can_change_signature)
5275 return TARGET_SSE2 ? 2 : 1;
5278 return 0;
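/* Illustrative example (assumption, not from the original source):

     float __attribute__((sseregparm)) mulf (float a, float b);

   compiled as 32-bit code with SSE enabled receives A and B in %xmm0
   and %xmm1; compiling the same declaration without SSE triggers the
   "calling ... with attribute sseregparm without SSE/SSE2 enabled"
   error above.  */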
5281 /* Return true if EAX is live at the start of the function. Used by
5282 ix86_expand_prologue to determine if we need special help before
5283 calling allocate_stack_worker. */
5285 static bool
5286 ix86_eax_live_at_start_p (void)
5288 /* Cheat. Don't bother working forward from ix86_function_regparm
5289 to the function type to whether an actual argument is located in
5290 eax. Instead just look at cfg info, which is still close enough
5291 to correct at this point. This gives false positives for broken
5292 functions that might use uninitialized data that happens to be
5293 allocated in eax, but who cares? */
5294 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5297 static bool
5298 ix86_keep_aggregate_return_pointer (tree fntype)
5300 tree attr;
5302 if (!TARGET_64BIT)
5304 attr = lookup_attribute ("callee_pop_aggregate_return",
5305 TYPE_ATTRIBUTES (fntype));
5306 if (attr)
5307 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5309 /* For 32-bit MS-ABI the default is to keep aggregate
5310 return pointer. */
5311 if (ix86_function_type_abi (fntype) == MS_ABI)
5312 return true;
5314 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5317 /* Value is the number of bytes of arguments automatically
5318 popped when returning from a subroutine call.
5319 FUNDECL is the declaration node of the function (as a tree),
5320 FUNTYPE is the data type of the function (as a tree),
5321 or for a library call it is an identifier node for the subroutine name.
5322 SIZE is the number of bytes of arguments passed on the stack.
5324 On the 80386, the RTD insn may be used to pop them if the number
5325 of args is fixed, but if the number is variable then the caller
5326 must pop them all. RTD can't be used for library calls now
5327 because the library is compiled with the Unix compiler.
5328 Use of RTD is a selectable option, since it is incompatible with
5329 standard Unix calling sequences. If the option is not selected,
5330 the caller must always pop the args.
5332 The attribute stdcall is equivalent to RTD on a per module basis. */
5334 static int
5335 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5337 unsigned int ccvt;
5339 /* None of the 64-bit ABIs pop arguments. */
5340 if (TARGET_64BIT)
5341 return 0;
5343 ccvt = ix86_get_callcvt (funtype);
5345 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5346 | IX86_CALLCVT_THISCALL)) != 0
5347 && ! stdarg_p (funtype))
5348 return size;
5350 /* Lose any fake structure return argument if it is passed on the stack. */
5351 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5352 && !ix86_keep_aggregate_return_pointer (funtype))
5354 int nregs = ix86_function_regparm (funtype, fundecl);
5355 if (nregs == 0)
5356 return GET_MODE_SIZE (Pmode);
5359 return 0;
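/* Illustrative example (assumption, not from the original source): for

     void __attribute__((stdcall)) f (int a, int b);

   in 32-bit code SIZE is 8, so this hook returns 8 and the callee
   finishes with "ret $8", popping its own arguments; a variadic or
   plain cdecl function returns 0 here and the caller adjusts %esp.  */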
5362 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5364 static bool
5365 ix86_legitimate_combined_insn (rtx insn)
5367 /* Check operand constraints in case hard registers were propagated
5368 into insn pattern. This check prevents combine pass from
5369 generating insn patterns with invalid hard register operands.
5370 These invalid insns can eventually confuse reload to error out
5371 with a spill failure. See also PRs 46829 and 46843. */
5372 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5374 int i;
5376 extract_insn (insn);
5377 preprocess_constraints ();
5379 for (i = 0; i < recog_data.n_operands; i++)
5381 rtx op = recog_data.operand[i];
5382 enum machine_mode mode = GET_MODE (op);
5383 struct operand_alternative *op_alt;
5384 int offset = 0;
5385 bool win;
5386 int j;
5388 /* A unary operator may be accepted by the predicate, but it
5389 is irrelevant for matching constraints. */
5390 if (UNARY_P (op))
5391 op = XEXP (op, 0);
5393 if (GET_CODE (op) == SUBREG)
5395 if (REG_P (SUBREG_REG (op))
5396 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5397 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5398 GET_MODE (SUBREG_REG (op)),
5399 SUBREG_BYTE (op),
5400 GET_MODE (op));
5401 op = SUBREG_REG (op);
5404 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5405 continue;
5407 op_alt = recog_op_alt[i];
5409 /* Operand has no constraints, anything is OK. */
5410 win = !recog_data.n_alternatives;
5412 for (j = 0; j < recog_data.n_alternatives; j++)
5414 if (op_alt[j].anything_ok
5415 || (op_alt[j].matches != -1
5416 && operands_match_p
5417 (recog_data.operand[i],
5418 recog_data.operand[op_alt[j].matches]))
5419 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5421 win = true;
5422 break;
5426 if (!win)
5427 return false;
5431 return true;
5434 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5436 static unsigned HOST_WIDE_INT
5437 ix86_asan_shadow_offset (void)
5439 return TARGET_LP64 ? (HOST_WIDE_INT_1 << 44)
5440 : (HOST_WIDE_INT_1 << 29);
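/* Illustrative note (assumption, not from the original source):
   AddressSanitizer maps an application address to its shadow byte as

     shadow_addr = (addr >> 3) + shadow_offset

   so the hook above selects a shadow base of 1<<44 for LP64 and 1<<29
   for 32-bit and x32 code.  */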
5443 /* Argument support functions. */
5445 /* Return true when register may be used to pass function parameters. */
5446 bool
5447 ix86_function_arg_regno_p (int regno)
5449 int i;
5450 const int *parm_regs;
5452 if (!TARGET_64BIT)
5454 if (TARGET_MACHO)
5455 return (regno < REGPARM_MAX
5456 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5457 else
5458 return (regno < REGPARM_MAX
5459 || (TARGET_MMX && MMX_REGNO_P (regno)
5460 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5461 || (TARGET_SSE && SSE_REGNO_P (regno)
5462 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5465 if (TARGET_MACHO)
5467 if (SSE_REGNO_P (regno) && TARGET_SSE)
5468 return true;
5470 else
5472 if (TARGET_SSE && SSE_REGNO_P (regno)
5473 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5474 return true;
5477 /* TODO: The function should depend on current function ABI but
5478 builtins.c would need updating then. Therefore we use the
5479 default ABI. */
5481 /* RAX is used as hidden argument to va_arg functions. */
5482 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5483 return true;
5485 if (ix86_abi == MS_ABI)
5486 parm_regs = x86_64_ms_abi_int_parameter_registers;
5487 else
5488 parm_regs = x86_64_int_parameter_registers;
5489 for (i = 0; i < (ix86_abi == MS_ABI
5490 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5491 if (regno == parm_regs[i])
5492 return true;
5493 return false;
5496 /* Return true if we do not know how to pass TYPE solely in registers. */
5498 static bool
5499 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5501 if (must_pass_in_stack_var_size_or_pad (mode, type))
5502 return true;
5504 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5505 The layout_type routine is crafty and tries to trick us into passing
5506 currently unsupported vector types on the stack by using TImode. */
5507 return (!TARGET_64BIT && mode == TImode
5508 && type && TREE_CODE (type) != VECTOR_TYPE);
5511 /* Return the size, in bytes, of the area reserved for arguments passed
5512 in registers for the function represented by FNDECL, depending on the
5513 ABI in use. */
5514 int
5515 ix86_reg_parm_stack_space (const_tree fndecl)
5517 enum calling_abi call_abi = SYSV_ABI;
5518 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5519 call_abi = ix86_function_abi (fndecl);
5520 else
5521 call_abi = ix86_function_type_abi (fndecl);
5522 if (TARGET_64BIT && call_abi == MS_ABI)
5523 return 32;
5524 return 0;
5527 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5528 call abi used. */
5529 enum calling_abi
5530 ix86_function_type_abi (const_tree fntype)
5532 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5534 enum calling_abi abi = ix86_abi;
5535 if (abi == SYSV_ABI)
5537 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5538 abi = MS_ABI;
5540 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5541 abi = SYSV_ABI;
5542 return abi;
5544 return ix86_abi;
5547 static bool
5548 ix86_function_ms_hook_prologue (const_tree fn)
5550 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5552 if (decl_function_context (fn) != NULL_TREE)
5553 error_at (DECL_SOURCE_LOCATION (fn),
5554 "ms_hook_prologue is not compatible with nested function");
5555 else
5556 return true;
5558 return false;
5561 static enum calling_abi
5562 ix86_function_abi (const_tree fndecl)
5564 if (! fndecl)
5565 return ix86_abi;
5566 return ix86_function_type_abi (TREE_TYPE (fndecl));
5569 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5570 call abi used. */
5571 enum calling_abi
5572 ix86_cfun_abi (void)
5574 if (! cfun)
5575 return ix86_abi;
5576 return cfun->machine->call_abi;
5579 /* Write the extra assembler code needed to declare a function properly. */
5581 void
5582 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5583 tree decl)
5585 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5587 if (is_ms_hook)
5589 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5590 unsigned int filler_cc = 0xcccccccc;
5592 for (i = 0; i < filler_count; i += 4)
5593 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5596 #ifdef SUBTARGET_ASM_UNWIND_INIT
5597 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5598 #endif
5600 ASM_OUTPUT_LABEL (asm_out_file, fname);
5602 /* Output magic byte marker, if hot-patch attribute is set. */
5603 if (is_ms_hook)
5605 if (TARGET_64BIT)
5607 /* leaq [%rsp + 0], %rsp */
5608 asm_fprintf (asm_out_file, ASM_BYTE
5609 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5611 else
5613 /* movl.s %edi, %edi
5614 push %ebp
5615 movl.s %esp, %ebp */
5616 asm_fprintf (asm_out_file, ASM_BYTE
5617 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5622 /* regclass.c */
5623 extern void init_regs (void);
5625 /* Implementation of call abi switching target hook. Specific to FNDECL
5626 the specific call register sets are set. See also
5627 ix86_conditional_register_usage for more details. */
5628 void
5629 ix86_call_abi_override (const_tree fndecl)
5631 if (fndecl == NULL_TREE)
5632 cfun->machine->call_abi = ix86_abi;
5633 else
5634 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5637 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5638 expensive re-initialization of init_regs each time we switch function context
5639 since this is needed only during RTL expansion. */
5640 static void
5641 ix86_maybe_switch_abi (void)
5643 if (TARGET_64BIT &&
5644 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5645 reinit_regs ();
5648 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5649 for a call to a function whose data type is FNTYPE.
5650 For a library call, FNTYPE is 0. */
5652 void
5653 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5654 tree fntype, /* tree ptr for function decl */
5655 rtx libname, /* SYMBOL_REF of library name or 0 */
5656 tree fndecl,
5657 int caller)
5659 struct cgraph_local_info *i;
5661 memset (cum, 0, sizeof (*cum));
5663 if (fndecl)
5665 i = cgraph_local_info (fndecl);
5666 cum->call_abi = ix86_function_abi (fndecl);
5668 else
5670 i = NULL;
5671 cum->call_abi = ix86_function_type_abi (fntype);
5674 cum->caller = caller;
5676 /* Set up the number of registers to use for passing arguments. */
5678 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5679 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5680 "or subtarget optimization implying it");
5681 cum->nregs = ix86_regparm;
5682 if (TARGET_64BIT)
5684 cum->nregs = (cum->call_abi == SYSV_ABI
5685 ? X86_64_REGPARM_MAX
5686 : X86_64_MS_REGPARM_MAX);
5688 if (TARGET_SSE)
5690 cum->sse_nregs = SSE_REGPARM_MAX;
5691 if (TARGET_64BIT)
5693 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5694 ? X86_64_SSE_REGPARM_MAX
5695 : X86_64_MS_SSE_REGPARM_MAX);
5698 if (TARGET_MMX)
5699 cum->mmx_nregs = MMX_REGPARM_MAX;
5700 cum->warn_avx = true;
5701 cum->warn_sse = true;
5702 cum->warn_mmx = true;
5704 /* Because the type might mismatch between caller and callee, we need to
5705 use the actual type of the function for local calls.
5706 FIXME: cgraph_analyze can be told to actually record if the function uses
5707 va_start, so for local functions maybe_vaarg can be made more aggressive,
5708 helping K&R code.
5709 FIXME: once the type system is fixed, we won't need this code anymore. */
5710 if (i && i->local && i->can_change_signature)
5711 fntype = TREE_TYPE (fndecl);
5712 cum->maybe_vaarg = (fntype
5713 ? (!prototype_p (fntype) || stdarg_p (fntype))
5714 : !libname);
5716 if (!TARGET_64BIT)
5718 /* If there are variable arguments, then we won't pass anything
5719 in registers in 32-bit mode. */
5720 if (stdarg_p (fntype))
5722 cum->nregs = 0;
5723 cum->sse_nregs = 0;
5724 cum->mmx_nregs = 0;
5725 cum->warn_avx = 0;
5726 cum->warn_sse = 0;
5727 cum->warn_mmx = 0;
5728 return;
5731 /* Use ecx and edx registers if function has fastcall attribute,
5732 else look for regparm information. */
5733 if (fntype)
5735 unsigned int ccvt = ix86_get_callcvt (fntype);
5736 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5738 cum->nregs = 1;
5739 cum->fastcall = 1; /* Same first register as in fastcall. */
5741 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5743 cum->nregs = 2;
5744 cum->fastcall = 1;
5746 else
5747 cum->nregs = ix86_function_regparm (fntype, fndecl);
5750 /* Set up the number of SSE registers used for passing SFmode
5751 and DFmode arguments. Warn for mismatching ABI. */
5752 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5756 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5757 But in the case of vector types, it is some vector mode.
5759 When we have only some of our vector isa extensions enabled, then there
5760 are some modes for which vector_mode_supported_p is false. For these
5761 modes, the generic vector support in gcc will choose some non-vector mode
5762 in order to implement the type. By computing the natural mode, we'll
5763 select the proper ABI location for the operand and not depend on whatever
5764 the middle-end decides to do with these vector types.
5766 The middle-end can't deal with vector types > 16 bytes. In this
5767 case, we return the original mode and warn about the ABI change if CUM
5768 isn't NULL. */
5770 static enum machine_mode
5771 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5773 enum machine_mode mode = TYPE_MODE (type);
5775 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5777 HOST_WIDE_INT size = int_size_in_bytes (type);
5778 if ((size == 8 || size == 16 || size == 32)
5779 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5780 && TYPE_VECTOR_SUBPARTS (type) > 1)
5782 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5784 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5785 mode = MIN_MODE_VECTOR_FLOAT;
5786 else
5787 mode = MIN_MODE_VECTOR_INT;
5789 /* Get the mode which has this inner mode and number of units. */
5790 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5791 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5792 && GET_MODE_INNER (mode) == innermode)
5794 if (size == 32 && !TARGET_AVX)
5796 static bool warnedavx;
5798 if (cum
5799 && !warnedavx
5800 && cum->warn_avx)
5802 warnedavx = true;
5803 warning (0, "AVX vector argument without AVX "
5804 "enabled changes the ABI");
5806 return TYPE_MODE (type);
5808 else if ((size == 8 || size == 16) && !TARGET_SSE)
5810 static bool warnedsse;
5812 if (cum
5813 && !warnedsse
5814 && cum->warn_sse)
5816 warnedsse = true;
5817 warning (0, "SSE vector argument without SSE "
5818 "enabled changes the ABI");
5820 return mode;
5822 else
5823 return mode;
5826 gcc_unreachable ();
5830 return mode;
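/* Illustrative example (assumption, not from the original source): for

     typedef int v4si __attribute__((vector_size (16)));

   type_natural_mode returns V4SImode, so a V4SI argument is assigned
   its SSE ABI slot even though, without -msse, the middle-end would
   otherwise have lowered the type to TImode; in that -mno-sse case the
   code above additionally warns once that the ABI changes.  */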
5833 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5834 this may not agree with the mode that the type system has chosen for the
5835 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5836 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5838 static rtx
5839 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5840 unsigned int regno)
5842 rtx tmp;
5844 if (orig_mode != BLKmode)
5845 tmp = gen_rtx_REG (orig_mode, regno);
5846 else
5848 tmp = gen_rtx_REG (mode, regno);
5849 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5850 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5853 return tmp;
5856 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5857 of this code is to classify each 8bytes of incoming argument by the register
5858 class and assign registers accordingly. */
5860 /* Return the union class of CLASS1 and CLASS2.
5861 See the x86-64 PS ABI for details. */
5863 static enum x86_64_reg_class
5864 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5866 /* Rule #1: If both classes are equal, this is the resulting class. */
5867 if (class1 == class2)
5868 return class1;
5870 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5871 the other class. */
5872 if (class1 == X86_64_NO_CLASS)
5873 return class2;
5874 if (class2 == X86_64_NO_CLASS)
5875 return class1;
5877 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5878 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5879 return X86_64_MEMORY_CLASS;
5881 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5882 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5883 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5884 return X86_64_INTEGERSI_CLASS;
5885 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5886 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5887 return X86_64_INTEGER_CLASS;
5889 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5890 MEMORY is used. */
5891 if (class1 == X86_64_X87_CLASS
5892 || class1 == X86_64_X87UP_CLASS
5893 || class1 == X86_64_COMPLEX_X87_CLASS
5894 || class2 == X86_64_X87_CLASS
5895 || class2 == X86_64_X87UP_CLASS
5896 || class2 == X86_64_COMPLEX_X87_CLASS)
5897 return X86_64_MEMORY_CLASS;
5899 /* Rule #6: Otherwise class SSE is used. */
5900 return X86_64_SSE_CLASS;
5903 /* Classify the argument of type TYPE and mode MODE.
5904 CLASSES will be filled by the register class used to pass each word
5905 of the operand. The number of words is returned. In case the parameter
5906 should be passed in memory, 0 is returned. As a special case for zero
5907 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5909 BIT_OFFSET is used internally for handling records and specifies the
5910 offset in bits modulo 256 to avoid overflow cases.
5912 See the x86-64 PS ABI for details.
5915 static int
5916 classify_argument (enum machine_mode mode, const_tree type,
5917 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5919 HOST_WIDE_INT bytes =
5920 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5921 int words
5922 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5924 /* Variable sized entities are always passed/returned in memory. */
5925 if (bytes < 0)
5926 return 0;
5928 if (mode != VOIDmode
5929 && targetm.calls.must_pass_in_stack (mode, type))
5930 return 0;
5932 if (type && AGGREGATE_TYPE_P (type))
5934 int i;
5935 tree field;
5936 enum x86_64_reg_class subclasses[MAX_CLASSES];
5938 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5939 if (bytes > 32)
5940 return 0;
5942 for (i = 0; i < words; i++)
5943 classes[i] = X86_64_NO_CLASS;
5945 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5946 signal the memory class, so handle it as a special case. */
5947 if (!words)
5949 classes[0] = X86_64_NO_CLASS;
5950 return 1;
5953 /* Classify each field of record and merge classes. */
5954 switch (TREE_CODE (type))
5956 case RECORD_TYPE:
5957 /* And now merge the fields of structure. */
5958 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5960 if (TREE_CODE (field) == FIELD_DECL)
5962 int num;
5964 if (TREE_TYPE (field) == error_mark_node)
5965 continue;
5967 /* Bitfields are always classified as integer. Handle them
5968 early, since later code would consider them to be
5969 misaligned integers. */
5970 if (DECL_BIT_FIELD (field))
5972 for (i = (int_bit_position (field)
5973 + (bit_offset % 64)) / 8 / 8;
5974 i < ((int_bit_position (field) + (bit_offset % 64))
5975 + tree_low_cst (DECL_SIZE (field), 0)
5976 + 63) / 8 / 8; i++)
5977 classes[i] =
5978 merge_classes (X86_64_INTEGER_CLASS,
5979 classes[i]);
5981 else
5983 int pos;
5985 type = TREE_TYPE (field);
5987 /* Flexible array member is ignored. */
5988 if (TYPE_MODE (type) == BLKmode
5989 && TREE_CODE (type) == ARRAY_TYPE
5990 && TYPE_SIZE (type) == NULL_TREE
5991 && TYPE_DOMAIN (type) != NULL_TREE
5992 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5993 == NULL_TREE))
5995 static bool warned;
5997 if (!warned && warn_psabi)
5999 warned = true;
6000 inform (input_location,
6001 "the ABI of passing struct with"
6002 " a flexible array member has"
6003 " changed in GCC 4.4");
6005 continue;
6007 num = classify_argument (TYPE_MODE (type), type,
6008 subclasses,
6009 (int_bit_position (field)
6010 + bit_offset) % 256);
6011 if (!num)
6012 return 0;
6013 pos = (int_bit_position (field)
6014 + (bit_offset % 64)) / 8 / 8;
6015 for (i = 0; i < num && (i + pos) < words; i++)
6016 classes[i + pos] =
6017 merge_classes (subclasses[i], classes[i + pos]);
6021 break;
6023 case ARRAY_TYPE:
6024 /* Arrays are handled as small records. */
6026 int num;
6027 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6028 TREE_TYPE (type), subclasses, bit_offset);
6029 if (!num)
6030 return 0;
6032 /* The partial classes are now full classes. */
6033 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6034 subclasses[0] = X86_64_SSE_CLASS;
6035 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6036 && !((bit_offset % 64) == 0 && bytes == 4))
6037 subclasses[0] = X86_64_INTEGER_CLASS;
6039 for (i = 0; i < words; i++)
6040 classes[i] = subclasses[i % num];
6042 break;
6044 case UNION_TYPE:
6045 case QUAL_UNION_TYPE:
6046 /* Unions are similar to RECORD_TYPE but offset is always 0.
6048 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6050 if (TREE_CODE (field) == FIELD_DECL)
6052 int num;
6054 if (TREE_TYPE (field) == error_mark_node)
6055 continue;
6057 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6058 TREE_TYPE (field), subclasses,
6059 bit_offset);
6060 if (!num)
6061 return 0;
6062 for (i = 0; i < num; i++)
6063 classes[i] = merge_classes (subclasses[i], classes[i]);
6066 break;
6068 default:
6069 gcc_unreachable ();
6072 if (words > 2)
6074 /* When size > 16 bytes, if the first one isn't
6075 X86_64_SSE_CLASS or any other ones aren't
6076 X86_64_SSEUP_CLASS, everything should be passed in
6077 memory. */
6078 if (classes[0] != X86_64_SSE_CLASS)
6079 return 0;
6081 for (i = 1; i < words; i++)
6082 if (classes[i] != X86_64_SSEUP_CLASS)
6083 return 0;
6086 /* Final merger cleanup. */
6087 for (i = 0; i < words; i++)
6089 /* If one class is MEMORY, everything should be passed in
6090 memory. */
6091 if (classes[i] == X86_64_MEMORY_CLASS)
6092 return 0;
6094 /* The X86_64_SSEUP_CLASS should be always preceded by
6095 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6096 if (classes[i] == X86_64_SSEUP_CLASS
6097 && classes[i - 1] != X86_64_SSE_CLASS
6098 && classes[i - 1] != X86_64_SSEUP_CLASS)
6100 /* The first one should never be X86_64_SSEUP_CLASS. */
6101 gcc_assert (i != 0);
6102 classes[i] = X86_64_SSE_CLASS;
6105 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6106 everything should be passed in memory. */
6107 if (classes[i] == X86_64_X87UP_CLASS
6108 && (classes[i - 1] != X86_64_X87_CLASS))
6110 static bool warned;
6112 /* The first one should never be X86_64_X87UP_CLASS. */
6113 gcc_assert (i != 0);
6114 if (!warned && warn_psabi)
6116 warned = true;
6117 inform (input_location,
6118 "the ABI of passing union with long double"
6119 " has changed in GCC 4.4");
6121 return 0;
6124 return words;
6127 /* Compute alignment needed. We align all types to natural boundaries with
6128 exception of XFmode that is aligned to 64bits. */
6129 if (mode != VOIDmode && mode != BLKmode)
6131 int mode_alignment = GET_MODE_BITSIZE (mode);
6133 if (mode == XFmode)
6134 mode_alignment = 128;
6135 else if (mode == XCmode)
6136 mode_alignment = 256;
6137 if (COMPLEX_MODE_P (mode))
6138 mode_alignment /= 2;
6139 /* Misaligned fields are always returned in memory. */
6140 if (bit_offset % mode_alignment)
6141 return 0;
6144 /* for V1xx modes, just use the base mode */
6145 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6146 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6147 mode = GET_MODE_INNER (mode);
6149 /* Classification of atomic types. */
6150 switch (mode)
6152 case SDmode:
6153 case DDmode:
6154 classes[0] = X86_64_SSE_CLASS;
6155 return 1;
6156 case TDmode:
6157 classes[0] = X86_64_SSE_CLASS;
6158 classes[1] = X86_64_SSEUP_CLASS;
6159 return 2;
6160 case DImode:
6161 case SImode:
6162 case HImode:
6163 case QImode:
6164 case CSImode:
6165 case CHImode:
6166 case CQImode:
6168 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6170 if (size <= 32)
6172 classes[0] = X86_64_INTEGERSI_CLASS;
6173 return 1;
6175 else if (size <= 64)
6177 classes[0] = X86_64_INTEGER_CLASS;
6178 return 1;
6180 else if (size <= 64+32)
6182 classes[0] = X86_64_INTEGER_CLASS;
6183 classes[1] = X86_64_INTEGERSI_CLASS;
6184 return 2;
6186 else if (size <= 64+64)
6188 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6189 return 2;
6191 else
6192 gcc_unreachable ();
6194 case CDImode:
6195 case TImode:
6196 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6197 return 2;
6198 case COImode:
6199 case OImode:
6200 /* OImode shouldn't be used directly. */
6201 gcc_unreachable ();
6202 case CTImode:
6203 return 0;
6204 case SFmode:
6205 if (!(bit_offset % 64))
6206 classes[0] = X86_64_SSESF_CLASS;
6207 else
6208 classes[0] = X86_64_SSE_CLASS;
6209 return 1;
6210 case DFmode:
6211 classes[0] = X86_64_SSEDF_CLASS;
6212 return 1;
6213 case XFmode:
6214 classes[0] = X86_64_X87_CLASS;
6215 classes[1] = X86_64_X87UP_CLASS;
6216 return 2;
6217 case TFmode:
6218 classes[0] = X86_64_SSE_CLASS;
6219 classes[1] = X86_64_SSEUP_CLASS;
6220 return 2;
6221 case SCmode:
6222 classes[0] = X86_64_SSE_CLASS;
6223 if (!(bit_offset % 64))
6224 return 1;
6225 else
6227 static bool warned;
6229 if (!warned && warn_psabi)
6231 warned = true;
6232 inform (input_location,
6233 "the ABI of passing structure with complex float"
6234 " member has changed in GCC 4.4");
6236 classes[1] = X86_64_SSESF_CLASS;
6237 return 2;
6239 case DCmode:
6240 classes[0] = X86_64_SSEDF_CLASS;
6241 classes[1] = X86_64_SSEDF_CLASS;
6242 return 2;
6243 case XCmode:
6244 classes[0] = X86_64_COMPLEX_X87_CLASS;
6245 return 1;
6246 case TCmode:
6247 /* This mode is larger than 16 bytes. */
6248 return 0;
6249 case V8SFmode:
6250 case V8SImode:
6251 case V32QImode:
6252 case V16HImode:
6253 case V4DFmode:
6254 case V4DImode:
6255 classes[0] = X86_64_SSE_CLASS;
6256 classes[1] = X86_64_SSEUP_CLASS;
6257 classes[2] = X86_64_SSEUP_CLASS;
6258 classes[3] = X86_64_SSEUP_CLASS;
6259 return 4;
6260 case V4SFmode:
6261 case V4SImode:
6262 case V16QImode:
6263 case V8HImode:
6264 case V2DFmode:
6265 case V2DImode:
6266 classes[0] = X86_64_SSE_CLASS;
6267 classes[1] = X86_64_SSEUP_CLASS;
6268 return 2;
6269 case V1TImode:
6270 case V1DImode:
6271 case V2SFmode:
6272 case V2SImode:
6273 case V4HImode:
6274 case V8QImode:
6275 classes[0] = X86_64_SSE_CLASS;
6276 return 1;
6277 case BLKmode:
6278 case VOIDmode:
6279 return 0;
6280 default:
6281 gcc_assert (VECTOR_MODE_P (mode));
6283 if (bytes > 16)
6284 return 0;
6286 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6288 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6289 classes[0] = X86_64_INTEGERSI_CLASS;
6290 else
6291 classes[0] = X86_64_INTEGER_CLASS;
6292 classes[1] = X86_64_INTEGER_CLASS;
6293 return 1 + (bytes > 8);
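/* Illustrative example (assumption, not from the original source): for
   the x86-64 SysV ABI,

     struct s { double d; long l; };

   classify_argument is expected to return 2 with classes[0] =
   X86_64_SSEDF_CLASS and classes[1] = X86_64_INTEGER_CLASS, so the
   first eightbyte goes to an SSE register and the second to an integer
   register; an aggregate larger than 32 bytes is rejected up front
   (return 0) and therefore passed in memory.  */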
6297 /* Examine the argument and return set number of register required in each
6298 class. Return 0 iff parameter should be passed in memory. */
6299 static int
6300 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6301 int *int_nregs, int *sse_nregs)
6303 enum x86_64_reg_class regclass[MAX_CLASSES];
6304 int n = classify_argument (mode, type, regclass, 0);
6306 *int_nregs = 0;
6307 *sse_nregs = 0;
6308 if (!n)
6309 return 0;
6310 for (n--; n >= 0; n--)
6311 switch (regclass[n])
6313 case X86_64_INTEGER_CLASS:
6314 case X86_64_INTEGERSI_CLASS:
6315 (*int_nregs)++;
6316 break;
6317 case X86_64_SSE_CLASS:
6318 case X86_64_SSESF_CLASS:
6319 case X86_64_SSEDF_CLASS:
6320 (*sse_nregs)++;
6321 break;
6322 case X86_64_NO_CLASS:
6323 case X86_64_SSEUP_CLASS:
6324 break;
6325 case X86_64_X87_CLASS:
6326 case X86_64_X87UP_CLASS:
6327 if (!in_return)
6328 return 0;
6329 break;
6330 case X86_64_COMPLEX_X87_CLASS:
6331 return in_return ? 2 : 0;
6332 case X86_64_MEMORY_CLASS:
6333 gcc_unreachable ();
6335 return 1;
6338 /* Construct container for the argument used by GCC interface. See
6339 FUNCTION_ARG for the detailed description. */
6341 static rtx
6342 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6343 const_tree type, int in_return, int nintregs, int nsseregs,
6344 const int *intreg, int sse_regno)
6346 /* The following variables hold the static issued_error state. */
6347 static bool issued_sse_arg_error;
6348 static bool issued_sse_ret_error;
6349 static bool issued_x87_ret_error;
6351 enum machine_mode tmpmode;
6352 int bytes =
6353 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6354 enum x86_64_reg_class regclass[MAX_CLASSES];
6355 int n;
6356 int i;
6357 int nexps = 0;
6358 int needed_sseregs, needed_intregs;
6359 rtx exp[MAX_CLASSES];
6360 rtx ret;
6362 n = classify_argument (mode, type, regclass, 0);
6363 if (!n)
6364 return NULL;
6365 if (!examine_argument (mode, type, in_return, &needed_intregs,
6366 &needed_sseregs))
6367 return NULL;
6368 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6369 return NULL;
6371 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6372 some less clueful developer tries to use floating-point anyway. */
6373 if (needed_sseregs && !TARGET_SSE)
6375 if (in_return)
6377 if (!issued_sse_ret_error)
6379 error ("SSE register return with SSE disabled");
6380 issued_sse_ret_error = true;
6383 else if (!issued_sse_arg_error)
6385 error ("SSE register argument with SSE disabled");
6386 issued_sse_arg_error = true;
6388 return NULL;
6391 /* Likewise, error if the ABI requires us to return values in the
6392 x87 registers and the user specified -mno-80387. */
6393 if (!TARGET_80387 && in_return)
6394 for (i = 0; i < n; i++)
6395 if (regclass[i] == X86_64_X87_CLASS
6396 || regclass[i] == X86_64_X87UP_CLASS
6397 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6399 if (!issued_x87_ret_error)
6401 error ("x87 register return with x87 disabled");
6402 issued_x87_ret_error = true;
6404 return NULL;
6407 /* First construct simple cases. Avoid SCmode, since we want to use
6408 single register to pass this type. */
6409 if (n == 1 && mode != SCmode)
6410 switch (regclass[0])
6412 case X86_64_INTEGER_CLASS:
6413 case X86_64_INTEGERSI_CLASS:
6414 return gen_rtx_REG (mode, intreg[0]);
6415 case X86_64_SSE_CLASS:
6416 case X86_64_SSESF_CLASS:
6417 case X86_64_SSEDF_CLASS:
6418 if (mode != BLKmode)
6419 return gen_reg_or_parallel (mode, orig_mode,
6420 SSE_REGNO (sse_regno));
6421 break;
6422 case X86_64_X87_CLASS:
6423 case X86_64_COMPLEX_X87_CLASS:
6424 return gen_rtx_REG (mode, FIRST_STACK_REG);
6425 case X86_64_NO_CLASS:
6426 /* Zero sized array, struct or class. */
6427 return NULL;
6428 default:
6429 gcc_unreachable ();
6431 if (n == 2
6432 && regclass[0] == X86_64_SSE_CLASS
6433 && regclass[1] == X86_64_SSEUP_CLASS
6434 && mode != BLKmode)
6435 return gen_reg_or_parallel (mode, orig_mode,
6436 SSE_REGNO (sse_regno));
6437 if (n == 4
6438 && regclass[0] == X86_64_SSE_CLASS
6439 && regclass[1] == X86_64_SSEUP_CLASS
6440 && regclass[2] == X86_64_SSEUP_CLASS
6441 && regclass[3] == X86_64_SSEUP_CLASS
6442 && mode != BLKmode)
6443 return gen_reg_or_parallel (mode, orig_mode,
6444 SSE_REGNO (sse_regno));
6445 if (n == 2
6446 && regclass[0] == X86_64_X87_CLASS
6447 && regclass[1] == X86_64_X87UP_CLASS)
6448 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6450 if (n == 2
6451 && regclass[0] == X86_64_INTEGER_CLASS
6452 && regclass[1] == X86_64_INTEGER_CLASS
6453 && (mode == CDImode || mode == TImode || mode == TFmode)
6454 && intreg[0] + 1 == intreg[1])
6455 return gen_rtx_REG (mode, intreg[0]);
6457 /* Otherwise figure out the entries of the PARALLEL. */
6458 for (i = 0; i < n; i++)
6460 int pos;
6462 switch (regclass[i])
6464 case X86_64_NO_CLASS:
6465 break;
6466 case X86_64_INTEGER_CLASS:
6467 case X86_64_INTEGERSI_CLASS:
6468 /* Merge TImodes on aligned occasions here too. */
6469 if (i * 8 + 8 > bytes)
6470 tmpmode
6471 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6472 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6473 tmpmode = SImode;
6474 else
6475 tmpmode = DImode;
6476 /* We've requested 24 bytes for which we
6477 don't have a mode. Use DImode. */
6478 if (tmpmode == BLKmode)
6479 tmpmode = DImode;
6480 exp [nexps++]
6481 = gen_rtx_EXPR_LIST (VOIDmode,
6482 gen_rtx_REG (tmpmode, *intreg),
6483 GEN_INT (i*8));
6484 intreg++;
6485 break;
6486 case X86_64_SSESF_CLASS:
6487 exp [nexps++]
6488 = gen_rtx_EXPR_LIST (VOIDmode,
6489 gen_rtx_REG (SFmode,
6490 SSE_REGNO (sse_regno)),
6491 GEN_INT (i*8));
6492 sse_regno++;
6493 break;
6494 case X86_64_SSEDF_CLASS:
6495 exp [nexps++]
6496 = gen_rtx_EXPR_LIST (VOIDmode,
6497 gen_rtx_REG (DFmode,
6498 SSE_REGNO (sse_regno)),
6499 GEN_INT (i*8));
6500 sse_regno++;
6501 break;
6502 case X86_64_SSE_CLASS:
6503 pos = i;
6504 switch (n)
6506 case 1:
6507 tmpmode = DImode;
6508 break;
6509 case 2:
6510 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6512 tmpmode = TImode;
6513 i++;
6515 else
6516 tmpmode = DImode;
6517 break;
6518 case 4:
6519 gcc_assert (i == 0
6520 && regclass[1] == X86_64_SSEUP_CLASS
6521 && regclass[2] == X86_64_SSEUP_CLASS
6522 && regclass[3] == X86_64_SSEUP_CLASS);
6523 tmpmode = OImode;
6524 i += 3;
6525 break;
6526 default:
6527 gcc_unreachable ();
6529 exp [nexps++]
6530 = gen_rtx_EXPR_LIST (VOIDmode,
6531 gen_rtx_REG (tmpmode,
6532 SSE_REGNO (sse_regno)),
6533 GEN_INT (pos*8));
6534 sse_regno++;
6535 break;
6536 default:
6537 gcc_unreachable ();
6541 /* Empty aligned struct, union or class. */
6542 if (nexps == 0)
6543 return NULL;
6545 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6546 for (i = 0; i < nexps; i++)
6547 XVECEXP (ret, 0, i) = exp [i];
6548 return ret;
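/* Illustrative example (assumption, not from the original source):
   continuing the struct { double d; long l; } case as a first argument,
   construct_container is expected to build roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI rdi)  (const_int 8))])

   i.e. the SSEDF eightbyte at offset 0 in the next SSE register and the
   INTEGER eightbyte at offset 8 in the next integer register.  */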
6551 /* Update the data in CUM to advance over an argument of mode MODE
6552 and data type TYPE. (TYPE is null for libcalls where that information
6553 may not be available.) */
6555 static void
6556 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6557 const_tree type, HOST_WIDE_INT bytes,
6558 HOST_WIDE_INT words)
6560 switch (mode)
6562 default:
6563 break;
6565 case BLKmode:
6566 if (bytes < 0)
6567 break;
6568 /* FALLTHRU */
6570 case DImode:
6571 case SImode:
6572 case HImode:
6573 case QImode:
6574 cum->words += words;
6575 cum->nregs -= words;
6576 cum->regno += words;
6578 if (cum->nregs <= 0)
6580 cum->nregs = 0;
6581 cum->regno = 0;
6583 break;
6585 case OImode:
6586 /* OImode shouldn't be used directly. */
6587 gcc_unreachable ();
6589 case DFmode:
6590 if (cum->float_in_sse < 2)
6591 break;
6592 case SFmode:
6593 if (cum->float_in_sse < 1)
6594 break;
6595 /* FALLTHRU */
6597 case V8SFmode:
6598 case V8SImode:
6599 case V32QImode:
6600 case V16HImode:
6601 case V4DFmode:
6602 case V4DImode:
6603 case TImode:
6604 case V16QImode:
6605 case V8HImode:
6606 case V4SImode:
6607 case V2DImode:
6608 case V4SFmode:
6609 case V2DFmode:
6610 if (!type || !AGGREGATE_TYPE_P (type))
6612 cum->sse_words += words;
6613 cum->sse_nregs -= 1;
6614 cum->sse_regno += 1;
6615 if (cum->sse_nregs <= 0)
6617 cum->sse_nregs = 0;
6618 cum->sse_regno = 0;
6621 break;
6623 case V8QImode:
6624 case V4HImode:
6625 case V2SImode:
6626 case V2SFmode:
6627 case V1TImode:
6628 case V1DImode:
6629 if (!type || !AGGREGATE_TYPE_P (type))
6631 cum->mmx_words += words;
6632 cum->mmx_nregs -= 1;
6633 cum->mmx_regno += 1;
6634 if (cum->mmx_nregs <= 0)
6636 cum->mmx_nregs = 0;
6637 cum->mmx_regno = 0;
6640 break;
6644 static void
6645 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6646 const_tree type, HOST_WIDE_INT words, bool named)
6648 int int_nregs, sse_nregs;
6650 /* Unnamed 256bit vector mode parameters are passed on stack. */
6651 if (!named && VALID_AVX256_REG_MODE (mode))
6652 return;
6654 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6655 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6657 cum->nregs -= int_nregs;
6658 cum->sse_nregs -= sse_nregs;
6659 cum->regno += int_nregs;
6660 cum->sse_regno += sse_nregs;
6662 else
6664 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6665 cum->words = (cum->words + align - 1) & ~(align - 1);
6666 cum->words += words;
6670 static void
6671 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6672 HOST_WIDE_INT words)
6674 /* Otherwise, this should be passed indirect. */
6675 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6677 cum->words += words;
6678 if (cum->nregs > 0)
6680 cum->nregs -= 1;
6681 cum->regno += 1;
6685 /* Update the data in CUM to advance over an argument of mode MODE and
6686 data type TYPE. (TYPE is null for libcalls where that information
6687 may not be available.) */
6689 static void
6690 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6691 const_tree type, bool named)
6693 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6694 HOST_WIDE_INT bytes, words;
6696 if (mode == BLKmode)
6697 bytes = int_size_in_bytes (type);
6698 else
6699 bytes = GET_MODE_SIZE (mode);
6700 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6702 if (type)
6703 mode = type_natural_mode (type, NULL);
6705 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6706 function_arg_advance_ms_64 (cum, bytes, words);
6707 else if (TARGET_64BIT)
6708 function_arg_advance_64 (cum, mode, type, words, named);
6709 else
6710 function_arg_advance_32 (cum, mode, type, bytes, words);
6713 /* Define where to put the arguments to a function.
6714 Value is zero to push the argument on the stack,
6715 or a hard register in which to store the argument.
6717 MODE is the argument's machine mode.
6718 TYPE is the data type of the argument (as a tree).
6719 This is null for libcalls where that information may
6720 not be available.
6721 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6722 the preceding args and about the function being called.
6723 NAMED is nonzero if this argument is a named parameter
6724 (otherwise it is an extra parameter matching an ellipsis). */
6726 static rtx
6727 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6728 enum machine_mode orig_mode, const_tree type,
6729 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6731 static bool warnedsse, warnedmmx;
6733 /* Avoid the AL settings for the Unix64 ABI. */
6734 if (mode == VOIDmode)
6735 return constm1_rtx;
6737 switch (mode)
6739 default:
6740 break;
6742 case BLKmode:
6743 if (bytes < 0)
6744 break;
6745 /* FALLTHRU */
6746 case DImode:
6747 case SImode:
6748 case HImode:
6749 case QImode:
6750 if (words <= cum->nregs)
6752 int regno = cum->regno;
6754 /* Fastcall allocates the first two DWORD (SImode) or
6755 smaller arguments to ECX and EDX if it isn't an
6756 aggregate type. */
6757 if (cum->fastcall)
6759 if (mode == BLKmode
6760 || mode == DImode
6761 || (type && AGGREGATE_TYPE_P (type)))
6762 break;
6764 /* ECX not EAX is the first allocated register. */
6765 if (regno == AX_REG)
6766 regno = CX_REG;
6768 return gen_rtx_REG (mode, regno);
6770 break;
6772 case DFmode:
6773 if (cum->float_in_sse < 2)
6774 break;
6775 case SFmode:
6776 if (cum->float_in_sse < 1)
6777 break;
6778 /* FALLTHRU */
6779 case TImode:
6780 /* In 32bit, we pass TImode in xmm registers. */
6781 case V16QImode:
6782 case V8HImode:
6783 case V4SImode:
6784 case V2DImode:
6785 case V4SFmode:
6786 case V2DFmode:
6787 if (!type || !AGGREGATE_TYPE_P (type))
6789 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6791 warnedsse = true;
6792 warning (0, "SSE vector argument without SSE enabled "
6793 "changes the ABI");
6795 if (cum->sse_nregs)
6796 return gen_reg_or_parallel (mode, orig_mode,
6797 cum->sse_regno + FIRST_SSE_REG);
6799 break;
6801 case OImode:
6802 /* OImode shouldn't be used directly. */
6803 gcc_unreachable ();
6805 case V8SFmode:
6806 case V8SImode:
6807 case V32QImode:
6808 case V16HImode:
6809 case V4DFmode:
6810 case V4DImode:
6811 if (!type || !AGGREGATE_TYPE_P (type))
6813 if (cum->sse_nregs)
6814 return gen_reg_or_parallel (mode, orig_mode,
6815 cum->sse_regno + FIRST_SSE_REG);
6817 break;
6819 case V8QImode:
6820 case V4HImode:
6821 case V2SImode:
6822 case V2SFmode:
6823 case V1TImode:
6824 case V1DImode:
6825 if (!type || !AGGREGATE_TYPE_P (type))
6827 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6829 warnedmmx = true;
6830 warning (0, "MMX vector argument without MMX enabled "
6831 "changes the ABI");
6833 if (cum->mmx_nregs)
6834 return gen_reg_or_parallel (mode, orig_mode,
6835 cum->mmx_regno + FIRST_MMX_REG);
6837 break;
6840 return NULL_RTX;
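/* Illustrative example (assumption, not from the original source): for
   32-bit code,

     void __attribute__((fastcall)) f (int a, int b, int c);

   the code above passes A in %ecx (regno AX_REG is rewritten to CX_REG
   for fastcall), B in %edx, and C on the stack once cum->nregs is
   exhausted.  */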
6843 static rtx
6844 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6845 enum machine_mode orig_mode, const_tree type, bool named)
6847 /* Handle a hidden AL argument containing number of registers
6848 for varargs x86-64 functions. */
6849 if (mode == VOIDmode)
6850 return GEN_INT (cum->maybe_vaarg
6851 ? (cum->sse_nregs < 0
6852 ? X86_64_SSE_REGPARM_MAX
6853 : cum->sse_regno)
6854 : -1);
6856 switch (mode)
6858 default:
6859 break;
6861 case V8SFmode:
6862 case V8SImode:
6863 case V32QImode:
6864 case V16HImode:
6865 case V4DFmode:
6866 case V4DImode:
6867 /* Unnamed 256bit vector mode parameters are passed on stack. */
6868 if (!named)
6869 return NULL;
6870 break;
6873 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6874 cum->sse_nregs,
6875 &x86_64_int_parameter_registers [cum->regno],
6876 cum->sse_regno);
6879 static rtx
6880 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6881 enum machine_mode orig_mode, bool named,
6882 HOST_WIDE_INT bytes)
6884 unsigned int regno;
6886 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6887 We use the value -2 to specify that the current function call is MSABI. */
6888 if (mode == VOIDmode)
6889 return GEN_INT (-2);
6891 /* If we've run out of registers, it goes on the stack. */
6892 if (cum->nregs == 0)
6893 return NULL_RTX;
6895 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6897 /* Only floating point modes are passed in anything but integer regs. */
6898 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6900 if (named)
6901 regno = cum->regno + FIRST_SSE_REG;
6902 else
6904 rtx t1, t2;
6906 /* Unnamed floating parameters are passed in both the
6907 SSE and integer registers. */
6908 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6909 t2 = gen_rtx_REG (mode, regno);
6910 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6911 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6912 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6915 /* Handle aggregate types passed in a register. */
6916 if (orig_mode == BLKmode)
6918 if (bytes > 0 && bytes <= 8)
6919 mode = (bytes > 4 ? DImode : SImode);
6920 if (mode == BLKmode)
6921 mode = DImode;
6924 return gen_reg_or_parallel (mode, orig_mode, regno);
6927 /* Return where to put the arguments to a function.
6928 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6930 MODE is the argument's machine mode. TYPE is the data type of the
6931 argument. It is null for libcalls where that information may not be
6932 available. CUM gives information about the preceding args and about
6933 the function being called. NAMED is nonzero if this argument is a
6934 named parameter (otherwise it is an extra parameter matching an
6935 ellipsis). */
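/* Rough illustration (not exhaustive): for a SysV x86-64 call such as
   void f (int i, double d), function_arg_64 below assigns the int to
   %edi and the double to %xmm0, while a plain 32-bit cdecl call gets
   NULL_RTX here and both arguments are pushed on the stack. */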
6937 static rtx
6938 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6939 const_tree type, bool named)
6941 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6942 enum machine_mode mode = omode;
6943 HOST_WIDE_INT bytes, words;
6944 rtx arg;
6946 if (mode == BLKmode)
6947 bytes = int_size_in_bytes (type);
6948 else
6949 bytes = GET_MODE_SIZE (mode);
6950 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6952 /* To simplify the code below, represent vector types with a vector mode
6953 even if MMX/SSE are not active. */
6954 if (type && TREE_CODE (type) == VECTOR_TYPE)
6955 mode = type_natural_mode (type, cum);
6957 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6958 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6959 else if (TARGET_64BIT)
6960 arg = function_arg_64 (cum, mode, omode, type, named);
6961 else
6962 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6964 return arg;
6967 /* A C expression that indicates when an argument must be passed by
6968 reference. If nonzero for an argument, a copy of that argument is
6969 made in memory and a pointer to the argument is passed instead of
6970 the argument itself. The pointer is passed in whatever way is
6971 appropriate for passing a pointer to that type. */
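/* Illustrative example of the MS x64 rule checked below: an aggregate of
   1, 2, 4 or 8 bytes is passed by value in a register, whereas a 12-byte
   struct or a __m128 is passed by reference. */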
6973 static bool
6974 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6975 enum machine_mode mode ATTRIBUTE_UNUSED,
6976 const_tree type, bool named ATTRIBUTE_UNUSED)
6978 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6980 /* See Windows x64 Software Convention. */
6981 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6983 int msize = (int) GET_MODE_SIZE (mode);
6984 if (type)
6986 /* Arrays are passed by reference. */
6987 if (TREE_CODE (type) == ARRAY_TYPE)
6988 return true;
6990 if (AGGREGATE_TYPE_P (type))
6992 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6993 are passed by reference. */
6994 msize = int_size_in_bytes (type);
6998 /* __m128 is passed by reference. */
6999 switch (msize) {
7000 case 1: case 2: case 4: case 8:
7001 break;
7002 default:
7003 return true;
7006 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7007 return 1;
7009 return 0;
7012 /* Return true when TYPE should be 128bit aligned for 32bit argument
7013 passing ABI. XXX: This function is obsolete and is only used for
7014 checking psABI compatibility with previous versions of GCC. */
7016 static bool
7017 ix86_compat_aligned_value_p (const_tree type)
7019 enum machine_mode mode = TYPE_MODE (type);
7020 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7021 || mode == TDmode
7022 || mode == TFmode
7023 || mode == TCmode)
7024 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7025 return true;
7026 if (TYPE_ALIGN (type) < 128)
7027 return false;
7029 if (AGGREGATE_TYPE_P (type))
7031 /* Walk the aggregates recursively. */
7032 switch (TREE_CODE (type))
7034 case RECORD_TYPE:
7035 case UNION_TYPE:
7036 case QUAL_UNION_TYPE:
7038 tree field;
7040 /* Walk all the structure fields. */
7041 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7043 if (TREE_CODE (field) == FIELD_DECL
7044 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7045 return true;
7047 break;
7050 case ARRAY_TYPE:
7051 /* Just for use if some languages pass arrays by value. */
7052 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7053 return true;
7054 break;
7056 default:
7057 gcc_unreachable ();
7060 return false;
7063 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7064 XXX: This function is obsolete and is only used for checking psABI
7065 compatibility with previous versions of GCC. */
7067 static unsigned int
7068 ix86_compat_function_arg_boundary (enum machine_mode mode,
7069 const_tree type, unsigned int align)
7071 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7072 natural boundaries. */
7073 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7075 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7076 make an exception for SSE modes since these require 128bit
7077 alignment.
7079 The handling here differs from field_alignment. ICC aligns MMX
7080 arguments to 4 byte boundaries, while structure fields are aligned
7081 to 8 byte boundaries. */
7082 if (!type)
7084 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7085 align = PARM_BOUNDARY;
7087 else
7089 if (!ix86_compat_aligned_value_p (type))
7090 align = PARM_BOUNDARY;
7093 if (align > BIGGEST_ALIGNMENT)
7094 align = BIGGEST_ALIGNMENT;
7095 return align;
7098 /* Return true when TYPE should be 128bit aligned for 32bit argument
7099 passing ABI. */
7101 static bool
7102 ix86_contains_aligned_value_p (const_tree type)
7104 enum machine_mode mode = TYPE_MODE (type);
7106 if (mode == XFmode || mode == XCmode)
7107 return false;
7109 if (TYPE_ALIGN (type) < 128)
7110 return false;
7112 if (AGGREGATE_TYPE_P (type))
7114 /* Walk the aggregates recursively. */
7115 switch (TREE_CODE (type))
7117 case RECORD_TYPE:
7118 case UNION_TYPE:
7119 case QUAL_UNION_TYPE:
7121 tree field;
7123 /* Walk all the structure fields. */
7124 for (field = TYPE_FIELDS (type);
7125 field;
7126 field = DECL_CHAIN (field))
7128 if (TREE_CODE (field) == FIELD_DECL
7129 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7130 return true;
7132 break;
7135 case ARRAY_TYPE:
7136 /* Just for use if some languages pass arrays by value. */
7137 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7138 return true;
7139 break;
7141 default:
7142 gcc_unreachable ();
7145 else
7146 return TYPE_ALIGN (type) >= 128;
7148 return false;
7151 /* Gives the alignment boundary, in bits, of an argument with the
7152 specified mode and type. */
7154 static unsigned int
7155 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7157 unsigned int align;
7158 if (type)
7160 /* Since the main variant type is used for the call, convert TYPE
7161 to its main variant. */
7162 type = TYPE_MAIN_VARIANT (type);
7163 align = TYPE_ALIGN (type);
7165 else
7166 align = GET_MODE_ALIGNMENT (mode);
7167 if (align < PARM_BOUNDARY)
7168 align = PARM_BOUNDARY;
7169 else
7171 static bool warned;
7172 unsigned int saved_align = align;
7174 if (!TARGET_64BIT)
7176 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7177 if (!type)
7179 if (mode == XFmode || mode == XCmode)
7180 align = PARM_BOUNDARY;
7182 else if (!ix86_contains_aligned_value_p (type))
7183 align = PARM_BOUNDARY;
7185 if (align < 128)
7186 align = PARM_BOUNDARY;
7189 if (warn_psabi
7190 && !warned
7191 && align != ix86_compat_function_arg_boundary (mode, type,
7192 saved_align))
7194 warned = true;
7195 inform (input_location,
7196 "The ABI for passing parameters with %d-byte"
7197 " alignment has changed in GCC 4.6",
7198 align / BITS_PER_UNIT);
7202 return align;
7205 /* Return true if N is a possible register number of function value. */
7207 static bool
7208 ix86_function_value_regno_p (const unsigned int regno)
7210 switch (regno)
7212 case AX_REG:
7213 return true;
7215 case FIRST_FLOAT_REG:
7216 /* TODO: The function should depend on current function ABI but
7217 builtins.c would need updating then. Therefore we use the
7218 default ABI. */
7219 if (TARGET_64BIT && ix86_abi == MS_ABI)
7220 return false;
7221 return TARGET_FLOAT_RETURNS_IN_80387;
7223 case FIRST_SSE_REG:
7224 return TARGET_SSE;
7226 case FIRST_MMX_REG:
7227 if (TARGET_MACHO || TARGET_64BIT)
7228 return false;
7229 return TARGET_MMX;
7232 return false;
7235 /* Define how to find the value returned by a function.
7236 VALTYPE is the data type of the value (as a tree).
7237 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7238 otherwise, FUNC is 0. */
7240 static rtx
7241 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7242 const_tree fntype, const_tree fn)
7244 unsigned int regno;
7246 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7247 we normally prevent this case when mmx is not available. However
7248 some ABIs may require the result to be returned like DImode. */
7249 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7250 regno = FIRST_MMX_REG;
7252 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7253 we prevent this case when sse is not available. However some ABIs
7254 may require the result to be returned like integer TImode. */
7255 else if (mode == TImode
7256 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7257 regno = FIRST_SSE_REG;
7259 /* 32-byte vector modes in %ymm0. */
7260 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7261 regno = FIRST_SSE_REG;
7263 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7264 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7265 regno = FIRST_FLOAT_REG;
7266 else
7267 /* Most things go in %eax. */
7268 regno = AX_REG;
7270 /* Override FP return register with %xmm0 for local functions when
7271 SSE math is enabled or for functions with sseregparm attribute. */
7272 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7274 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7275 if ((sse_level >= 1 && mode == SFmode)
7276 || (sse_level == 2 && mode == DFmode))
7277 regno = FIRST_SSE_REG;
7280 /* OImode shouldn't be used directly. */
7281 gcc_assert (mode != OImode);
7283 return gen_rtx_REG (orig_mode, regno);
7286 static rtx
7287 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7288 const_tree valtype)
7290 rtx ret;
7292 /* Handle libcalls, which don't provide a type node. */
7293 if (valtype == NULL)
7295 unsigned int regno;
7297 switch (mode)
7299 case SFmode:
7300 case SCmode:
7301 case DFmode:
7302 case DCmode:
7303 case TFmode:
7304 case SDmode:
7305 case DDmode:
7306 case TDmode:
7307 regno = FIRST_SSE_REG;
7308 break;
7309 case XFmode:
7310 case XCmode:
7311 regno = FIRST_FLOAT_REG;
7312 break;
7313 case TCmode:
7314 return NULL;
7315 default:
7316 regno = AX_REG;
7319 return gen_rtx_REG (mode, regno);
7321 else if (POINTER_TYPE_P (valtype))
7323 /* Pointers are always returned in word_mode. */
7324 mode = word_mode;
7327 ret = construct_container (mode, orig_mode, valtype, 1,
7328 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7329 x86_64_int_return_registers, 0);
7331 /* For zero sized structures, construct_container returns NULL, but we
7332 need to keep the rest of the compiler happy by returning a meaningful value. */
7333 if (!ret)
7334 ret = gen_rtx_REG (orig_mode, AX_REG);
7336 return ret;
7339 static rtx
7340 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7341 const_tree valtype)
7343 unsigned int regno = AX_REG;
7345 if (TARGET_SSE)
7347 switch (GET_MODE_SIZE (mode))
7349 case 16:
7350 if (valtype != NULL_TREE
7351 && !VECTOR_INTEGER_TYPE_P (valtype)
7353 && !INTEGRAL_TYPE_P (valtype)
7354 && !VECTOR_FLOAT_TYPE_P (valtype))
7355 break;
7356 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7357 && !COMPLEX_MODE_P (mode))
7358 regno = FIRST_SSE_REG;
7359 break;
7360 case 8:
7361 case 4:
7362 if (mode == SFmode || mode == DFmode)
7363 regno = FIRST_SSE_REG;
7364 break;
7365 default:
7366 break;
7369 return gen_rtx_REG (orig_mode, regno);
7372 static rtx
7373 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7374 enum machine_mode orig_mode, enum machine_mode mode)
7376 const_tree fn, fntype;
7378 fn = NULL_TREE;
7379 if (fntype_or_decl && DECL_P (fntype_or_decl))
7380 fn = fntype_or_decl;
7381 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7383 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7384 return function_value_ms_64 (orig_mode, mode, valtype);
7385 else if (TARGET_64BIT)
7386 return function_value_64 (orig_mode, mode, valtype);
7387 else
7388 return function_value_32 (orig_mode, mode, fntype, fn);
7391 static rtx
7392 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7393 bool outgoing ATTRIBUTE_UNUSED)
7395 enum machine_mode mode, orig_mode;
7397 orig_mode = TYPE_MODE (valtype);
7398 mode = type_natural_mode (valtype, NULL);
7399 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7402 /* Pointer function arguments and return values are promoted to
7403 word_mode. */
7405 static enum machine_mode
7406 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7407 int *punsignedp, const_tree fntype,
7408 int for_return)
7410 if (type != NULL_TREE && POINTER_TYPE_P (type))
7412 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7413 return word_mode;
7415 return default_promote_function_mode (type, mode, punsignedp, fntype,
7416 for_return);
7419 /* Return true if a structure, union or array with MODE containing FIELD
7420 should be accessed using BLKmode. */
7422 static bool
7423 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7425 /* Union with XFmode must be in BLKmode. */
7426 return (mode == XFmode
7427 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7428 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7432 ix86_libcall_value (enum machine_mode mode)
7434 return ix86_function_value_1 (NULL, NULL, mode, mode);
7437 /* Return true iff type is returned in memory. */
7439 static bool ATTRIBUTE_UNUSED
7440 return_in_memory_32 (const_tree type, enum machine_mode mode)
7442 HOST_WIDE_INT size;
7444 if (mode == BLKmode)
7445 return true;
7447 size = int_size_in_bytes (type);
7449 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7450 return false;
7452 if (VECTOR_MODE_P (mode) || mode == TImode)
7454 /* User-created vectors small enough to fit in EAX. */
7455 if (size < 8)
7456 return false;
7458 /* MMX/3dNow values are returned in MM0,
7459 except when it doesn't exist or the ABI prescribes otherwise. */
7460 if (size == 8)
7461 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7463 /* SSE values are returned in XMM0, except when it doesn't exist. */
7464 if (size == 16)
7465 return !TARGET_SSE;
7467 /* AVX values are returned in YMM0, except when it doesn't exist. */
7468 if (size == 32)
7469 return !TARGET_AVX;
7472 if (mode == XFmode)
7473 return false;
7475 if (size > 12)
7476 return true;
7478 /* OImode shouldn't be used directly. */
7479 gcc_assert (mode != OImode);
7481 return false;
7484 static bool ATTRIBUTE_UNUSED
7485 return_in_memory_64 (const_tree type, enum machine_mode mode)
7487 int needed_intregs, needed_sseregs;
7488 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7491 static bool ATTRIBUTE_UNUSED
7492 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7494 HOST_WIDE_INT size = int_size_in_bytes (type);
7496 /* __m128 is returned in xmm0. */
7497 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7498 || VECTOR_FLOAT_TYPE_P (type))
7499 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7500 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7501 return false;
7503 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7504 return size != 1 && size != 2 && size != 4 && size != 8;
7507 static bool
7508 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7510 #ifdef SUBTARGET_RETURN_IN_MEMORY
7511 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7512 #else
7513 const enum machine_mode mode = type_natural_mode (type, NULL);
7515 if (TARGET_64BIT)
7517 if (ix86_function_type_abi (fntype) == MS_ABI)
7518 return return_in_memory_ms_64 (type, mode);
7519 else
7520 return return_in_memory_64 (type, mode);
7522 else
7523 return return_in_memory_32 (type, mode);
7524 #endif
7527 /* When returning SSE vector types, we have a choice of either
7528 (1) being abi incompatible with a -march switch, or
7529 (2) generating an error.
7530 Given no good solution, I think the safest thing is one warning.
7531 The user won't be able to use -Werror, but....
7533 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7534 called in response to actually generating a caller or callee that
7535 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7536 via aggregate_value_p for general type probing from tree-ssa. */
7538 static rtx
7539 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7541 static bool warnedsse, warnedmmx;
7543 if (!TARGET_64BIT && type)
7545 /* Look at the return type of the function, not the function type. */
7546 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7548 if (!TARGET_SSE && !warnedsse)
7550 if (mode == TImode
7551 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7553 warnedsse = true;
7554 warning (0, "SSE vector return without SSE enabled "
7555 "changes the ABI");
7559 if (!TARGET_MMX && !warnedmmx)
7561 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7563 warnedmmx = true;
7564 warning (0, "MMX vector return without MMX enabled "
7565 "changes the ABI");
7570 return NULL;
7574 /* Create the va_list data type. */
7576 /* Returns the calling convention specific va_list data type.
7577 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7579 static tree
7580 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7582 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7584 /* For i386 we use a plain pointer to the argument area. */
7585 if (!TARGET_64BIT || abi == MS_ABI)
7586 return build_pointer_type (char_type_node);
7588 record = lang_hooks.types.make_type (RECORD_TYPE);
7589 type_decl = build_decl (BUILTINS_LOCATION,
7590 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7592 f_gpr = build_decl (BUILTINS_LOCATION,
7593 FIELD_DECL, get_identifier ("gp_offset"),
7594 unsigned_type_node);
7595 f_fpr = build_decl (BUILTINS_LOCATION,
7596 FIELD_DECL, get_identifier ("fp_offset"),
7597 unsigned_type_node);
7598 f_ovf = build_decl (BUILTINS_LOCATION,
7599 FIELD_DECL, get_identifier ("overflow_arg_area"),
7600 ptr_type_node);
7601 f_sav = build_decl (BUILTINS_LOCATION,
7602 FIELD_DECL, get_identifier ("reg_save_area"),
7603 ptr_type_node);
7605 va_list_gpr_counter_field = f_gpr;
7606 va_list_fpr_counter_field = f_fpr;
7608 DECL_FIELD_CONTEXT (f_gpr) = record;
7609 DECL_FIELD_CONTEXT (f_fpr) = record;
7610 DECL_FIELD_CONTEXT (f_ovf) = record;
7611 DECL_FIELD_CONTEXT (f_sav) = record;
7613 TYPE_STUB_DECL (record) = type_decl;
7614 TYPE_NAME (record) = type_decl;
7615 TYPE_FIELDS (record) = f_gpr;
7616 DECL_CHAIN (f_gpr) = f_fpr;
7617 DECL_CHAIN (f_fpr) = f_ovf;
7618 DECL_CHAIN (f_ovf) = f_sav;
7620 layout_type (record);
7622 /* The correct type is an array type of one element. */
7623 return build_array_type (record, build_index_type (size_zero_node));
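/* For reference, the record laid out above corresponds to the familiar
   SysV x86-64 va_list declaration (illustrative C only, not used by the
   compiler):

     typedef struct {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag[1];  */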
7626 /* Set up the builtin va_list data type and, for 64-bit, the additional
7627 calling convention specific va_list data types. */
7629 static tree
7630 ix86_build_builtin_va_list (void)
7632 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7634 /* Initialize abi specific va_list builtin types. */
7635 if (TARGET_64BIT)
7637 tree t;
7638 if (ix86_abi == MS_ABI)
7640 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7641 if (TREE_CODE (t) != RECORD_TYPE)
7642 t = build_variant_type_copy (t);
7643 sysv_va_list_type_node = t;
7645 else
7647 t = ret;
7648 if (TREE_CODE (t) != RECORD_TYPE)
7649 t = build_variant_type_copy (t);
7650 sysv_va_list_type_node = t;
7652 if (ix86_abi != MS_ABI)
7654 t = ix86_build_builtin_va_list_abi (MS_ABI);
7655 if (TREE_CODE (t) != RECORD_TYPE)
7656 t = build_variant_type_copy (t);
7657 ms_va_list_type_node = t;
7659 else
7661 t = ret;
7662 if (TREE_CODE (t) != RECORD_TYPE)
7663 t = build_variant_type_copy (t);
7664 ms_va_list_type_node = t;
7668 return ret;
7671 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7673 static void
7674 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7676 rtx save_area, mem;
7677 alias_set_type set;
7678 int i, max;
7680 /* GPR size of varargs save area. */
7681 if (cfun->va_list_gpr_size)
7682 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7683 else
7684 ix86_varargs_gpr_size = 0;
7686 /* FPR size of varargs save area. We don't need it if we don't pass
7687 anything in SSE registers. */
7688 if (TARGET_SSE && cfun->va_list_fpr_size)
7689 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7690 else
7691 ix86_varargs_fpr_size = 0;
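/* Assuming the usual SysV limits of 6 GP and 8 SSE parameter registers,
   the save area is therefore at most 6*8 = 48 bytes of GPR slots followed
   by 8*16 = 128 bytes of XMM slots. */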
7693 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7694 return;
7696 save_area = frame_pointer_rtx;
7697 set = get_varargs_alias_set ();
7699 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7700 if (max > X86_64_REGPARM_MAX)
7701 max = X86_64_REGPARM_MAX;
7703 for (i = cum->regno; i < max; i++)
7705 mem = gen_rtx_MEM (word_mode,
7706 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7707 MEM_NOTRAP_P (mem) = 1;
7708 set_mem_alias_set (mem, set);
7709 emit_move_insn (mem,
7710 gen_rtx_REG (word_mode,
7711 x86_64_int_parameter_registers[i]));
7714 if (ix86_varargs_fpr_size)
7716 enum machine_mode smode;
7717 rtx label, test;
7719 /* Now emit code to save SSE registers. The AX parameter contains the number
7720 of SSE parameter registers used to call this function, though all we
7721 actually check here is the zero/non-zero status. */
7723 label = gen_label_rtx ();
7724 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7725 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7726 label));
7728 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7729 we used movdqa (i.e. TImode) instead? Perhaps even better would
7730 be if we could determine the real mode of the data, via a hook
7731 into pass_stdarg. Ignore all that for now. */
7732 smode = V4SFmode;
7733 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7734 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7736 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7737 if (max > X86_64_SSE_REGPARM_MAX)
7738 max = X86_64_SSE_REGPARM_MAX;
7740 for (i = cum->sse_regno; i < max; ++i)
7742 mem = plus_constant (Pmode, save_area,
7743 i * 16 + ix86_varargs_gpr_size);
7744 mem = gen_rtx_MEM (smode, mem);
7745 MEM_NOTRAP_P (mem) = 1;
7746 set_mem_alias_set (mem, set);
7747 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7749 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7752 emit_label (label);
7756 static void
7757 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7759 alias_set_type set = get_varargs_alias_set ();
7760 int i;
7762 /* Reset to zero, as a SysV va_arg might have been used
7763 before. */
7764 ix86_varargs_gpr_size = 0;
7765 ix86_varargs_fpr_size = 0;
7767 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7769 rtx reg, mem;
7771 mem = gen_rtx_MEM (Pmode,
7772 plus_constant (Pmode, virtual_incoming_args_rtx,
7773 i * UNITS_PER_WORD));
7774 MEM_NOTRAP_P (mem) = 1;
7775 set_mem_alias_set (mem, set);
7777 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7778 emit_move_insn (mem, reg);
7782 static void
7783 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7784 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7785 int no_rtl)
7787 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7788 CUMULATIVE_ARGS next_cum;
7789 tree fntype;
7791 /* This argument doesn't appear to be used anymore, which is good,
7792 because the old code here didn't suppress rtl generation. */
7793 gcc_assert (!no_rtl);
7795 if (!TARGET_64BIT)
7796 return;
7798 fntype = TREE_TYPE (current_function_decl);
7800 /* For varargs, we do not want to skip the dummy va_dcl argument.
7801 For stdargs, we do want to skip the last named argument. */
7802 next_cum = *cum;
7803 if (stdarg_p (fntype))
7804 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7805 true);
7807 if (cum->call_abi == MS_ABI)
7808 setup_incoming_varargs_ms_64 (&next_cum);
7809 else
7810 setup_incoming_varargs_64 (&next_cum);
7813 /* Checks if TYPE is of kind va_list char *. */
7815 static bool
7816 is_va_list_char_pointer (tree type)
7818 tree canonic;
7820 /* For 32-bit it is always true. */
7821 if (!TARGET_64BIT)
7822 return true;
7823 canonic = ix86_canonical_va_list_type (type);
7824 return (canonic == ms_va_list_type_node
7825 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7828 /* Implement va_start. */
7830 static void
7831 ix86_va_start (tree valist, rtx nextarg)
7833 HOST_WIDE_INT words, n_gpr, n_fpr;
7834 tree f_gpr, f_fpr, f_ovf, f_sav;
7835 tree gpr, fpr, ovf, sav, t;
7836 tree type;
7837 rtx ovf_rtx;
7839 if (flag_split_stack
7840 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7842 unsigned int scratch_regno;
7844 /* When we are splitting the stack, we can't refer to the stack
7845 arguments using internal_arg_pointer, because they may be on
7846 the old stack. The split stack prologue will arrange to
7847 leave a pointer to the old stack arguments in a scratch
7848 register, which we here copy to a pseudo-register. The split
7849 stack prologue can't set the pseudo-register directly because
7850 it (the prologue) runs before any registers have been saved. */
7852 scratch_regno = split_stack_prologue_scratch_regno ();
7853 if (scratch_regno != INVALID_REGNUM)
7855 rtx reg, seq;
7857 reg = gen_reg_rtx (Pmode);
7858 cfun->machine->split_stack_varargs_pointer = reg;
7860 start_sequence ();
7861 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7862 seq = get_insns ();
7863 end_sequence ();
7865 push_topmost_sequence ();
7866 emit_insn_after (seq, entry_of_function ());
7867 pop_topmost_sequence ();
7871 /* Only 64bit target needs something special. */
7872 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7874 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7875 std_expand_builtin_va_start (valist, nextarg);
7876 else
7878 rtx va_r, next;
7880 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7881 next = expand_binop (ptr_mode, add_optab,
7882 cfun->machine->split_stack_varargs_pointer,
7883 crtl->args.arg_offset_rtx,
7884 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7885 convert_move (va_r, next, 0);
7887 return;
7890 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7891 f_fpr = DECL_CHAIN (f_gpr);
7892 f_ovf = DECL_CHAIN (f_fpr);
7893 f_sav = DECL_CHAIN (f_ovf);
7895 valist = build_simple_mem_ref (valist);
7896 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7897 /* The following should be folded into the MEM_REF offset. */
7898 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7899 f_gpr, NULL_TREE);
7900 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7901 f_fpr, NULL_TREE);
7902 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7903 f_ovf, NULL_TREE);
7904 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7905 f_sav, NULL_TREE);
7907 /* Count number of gp and fp argument registers used. */
7908 words = crtl->args.info.words;
7909 n_gpr = crtl->args.info.regno;
7910 n_fpr = crtl->args.info.sse_regno;
7912 if (cfun->va_list_gpr_size)
7914 type = TREE_TYPE (gpr);
7915 t = build2 (MODIFY_EXPR, type,
7916 gpr, build_int_cst (type, n_gpr * 8));
7917 TREE_SIDE_EFFECTS (t) = 1;
7918 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7921 if (TARGET_SSE && cfun->va_list_fpr_size)
7923 type = TREE_TYPE (fpr);
7924 t = build2 (MODIFY_EXPR, type, fpr,
7925 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7926 TREE_SIDE_EFFECTS (t) = 1;
7927 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
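/* Worked example (assuming the standard 6 GP / 8 SSE register limits):
   if the named arguments consumed two GP registers and one SSE register,
   gp_offset starts at 2*8 = 16 and fp_offset at 6*8 + 1*16 = 64. */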
7930 /* Find the overflow area. */
7931 type = TREE_TYPE (ovf);
7932 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7933 ovf_rtx = crtl->args.internal_arg_pointer;
7934 else
7935 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7936 t = make_tree (type, ovf_rtx);
7937 if (words != 0)
7938 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7939 t = build2 (MODIFY_EXPR, type, ovf, t);
7940 TREE_SIDE_EFFECTS (t) = 1;
7941 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7943 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7945 /* Find the register save area.
7946 The function prologue saves it right above the stack frame. */
7947 type = TREE_TYPE (sav);
7948 t = make_tree (type, frame_pointer_rtx);
7949 if (!ix86_varargs_gpr_size)
7950 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7951 t = build2 (MODIFY_EXPR, type, sav, t);
7952 TREE_SIDE_EFFECTS (t) = 1;
7953 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7957 /* Implement va_arg. */
7959 static tree
7960 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7961 gimple_seq *post_p)
7963 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7964 tree f_gpr, f_fpr, f_ovf, f_sav;
7965 tree gpr, fpr, ovf, sav, t;
7966 int size, rsize;
7967 tree lab_false, lab_over = NULL_TREE;
7968 tree addr, t2;
7969 rtx container;
7970 int indirect_p = 0;
7971 tree ptrtype;
7972 enum machine_mode nat_mode;
7973 unsigned int arg_boundary;
7975 /* Only 64bit target needs something special. */
7976 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7977 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7979 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7980 f_fpr = DECL_CHAIN (f_gpr);
7981 f_ovf = DECL_CHAIN (f_fpr);
7982 f_sav = DECL_CHAIN (f_ovf);
7984 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7985 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7986 valist = build_va_arg_indirect_ref (valist);
7987 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7988 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7989 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7991 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7992 if (indirect_p)
7993 type = build_pointer_type (type);
7994 size = int_size_in_bytes (type);
7995 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
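/* E.g. a 12-byte structure has size == 12 and, with 8-byte words,
   rsize == 2: it occupies two word-sized slots when taken from the
   overflow area below. */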
7997 nat_mode = type_natural_mode (type, NULL);
7998 switch (nat_mode)
8000 case V8SFmode:
8001 case V8SImode:
8002 case V32QImode:
8003 case V16HImode:
8004 case V4DFmode:
8005 case V4DImode:
8006 /* Unnamed 256bit vector mode parameters are passed on stack. */
8007 if (!TARGET_64BIT_MS_ABI)
8009 container = NULL;
8010 break;
8013 default:
8014 container = construct_container (nat_mode, TYPE_MODE (type),
8015 type, 0, X86_64_REGPARM_MAX,
8016 X86_64_SSE_REGPARM_MAX, intreg,
8018 break;
8021 /* Pull the value out of the saved registers. */
8023 addr = create_tmp_var (ptr_type_node, "addr");
8025 if (container)
8027 int needed_intregs, needed_sseregs;
8028 bool need_temp;
8029 tree int_addr, sse_addr;
8031 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8032 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8034 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8036 need_temp = (!REG_P (container)
8037 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8038 || TYPE_ALIGN (type) > 128));
8040 /* If we are passing a structure, verify that it occupies a consecutive
8041 block in the register save area. If not, we need to do moves. */
8042 if (!need_temp && !REG_P (container))
8045 /* Verify that all registers are strictly consecutive. */
8045 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8047 int i;
8049 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8051 rtx slot = XVECEXP (container, 0, i);
8052 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8053 || INTVAL (XEXP (slot, 1)) != i * 16)
8054 need_temp = 1;
8057 else
8059 int i;
8061 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8063 rtx slot = XVECEXP (container, 0, i);
8064 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8065 || INTVAL (XEXP (slot, 1)) != i * 8)
8066 need_temp = 1;
8070 if (!need_temp)
8072 int_addr = addr;
8073 sse_addr = addr;
8075 else
8077 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8078 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8081 /* First ensure that we fit completely in registers. */
8082 if (needed_intregs)
8084 t = build_int_cst (TREE_TYPE (gpr),
8085 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8086 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8087 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8088 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8089 gimplify_and_add (t, pre_p);
8091 if (needed_sseregs)
8093 t = build_int_cst (TREE_TYPE (fpr),
8094 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8095 + X86_64_REGPARM_MAX * 8);
8096 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8097 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8098 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8099 gimplify_and_add (t, pre_p);
8102 /* Compute index to start of area used for integer regs. */
8103 if (needed_intregs)
8105 /* int_addr = gpr + sav; */
8106 t = fold_build_pointer_plus (sav, gpr);
8107 gimplify_assign (int_addr, t, pre_p);
8109 if (needed_sseregs)
8111 /* sse_addr = fpr + sav; */
8112 t = fold_build_pointer_plus (sav, fpr);
8113 gimplify_assign (sse_addr, t, pre_p);
8115 if (need_temp)
8117 int i, prev_size = 0;
8118 tree temp = create_tmp_var (type, "va_arg_tmp");
8120 /* addr = &temp; */
8121 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8122 gimplify_assign (addr, t, pre_p);
8124 for (i = 0; i < XVECLEN (container, 0); i++)
8126 rtx slot = XVECEXP (container, 0, i);
8127 rtx reg = XEXP (slot, 0);
8128 enum machine_mode mode = GET_MODE (reg);
8129 tree piece_type;
8130 tree addr_type;
8131 tree daddr_type;
8132 tree src_addr, src;
8133 int src_offset;
8134 tree dest_addr, dest;
8135 int cur_size = GET_MODE_SIZE (mode);
8137 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8138 prev_size = INTVAL (XEXP (slot, 1));
8139 if (prev_size + cur_size > size)
8141 cur_size = size - prev_size;
8142 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8143 if (mode == BLKmode)
8144 mode = QImode;
8146 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8147 if (mode == GET_MODE (reg))
8148 addr_type = build_pointer_type (piece_type);
8149 else
8150 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8151 true);
8152 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8153 true);
8155 if (SSE_REGNO_P (REGNO (reg)))
8157 src_addr = sse_addr;
8158 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8160 else
8162 src_addr = int_addr;
8163 src_offset = REGNO (reg) * 8;
8165 src_addr = fold_convert (addr_type, src_addr);
8166 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8168 dest_addr = fold_convert (daddr_type, addr);
8169 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8170 if (cur_size == GET_MODE_SIZE (mode))
8172 src = build_va_arg_indirect_ref (src_addr);
8173 dest = build_va_arg_indirect_ref (dest_addr);
8175 gimplify_assign (dest, src, pre_p);
8177 else
8179 tree copy
8180 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8181 3, dest_addr, src_addr,
8182 size_int (cur_size));
8183 gimplify_and_add (copy, pre_p);
8185 prev_size += cur_size;
8189 if (needed_intregs)
8191 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8192 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8193 gimplify_assign (gpr, t, pre_p);
8196 if (needed_sseregs)
8198 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8199 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8200 gimplify_assign (fpr, t, pre_p);
8203 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8205 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8208 /* ... otherwise out of the overflow area. */
8210 /* When we align a parameter on the stack for the caller, if the
8211 parameter's alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will
8212 be aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee here
8213 with the caller. */
8214 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8215 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8216 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8218 /* Care for on-stack alignment if needed. */
8219 if (arg_boundary <= 64 || size == 0)
8220 t = ovf;
8221 else
8223 HOST_WIDE_INT align = arg_boundary / 8;
8224 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8225 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8226 build_int_cst (TREE_TYPE (t), -align));
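/* The two statements above round OVF up to the next ALIGN-byte boundary:
   e.g. with a 16-byte argument boundary, align == 16 and an overflow
   pointer ending in 0x08 is advanced to the next address ending in 0x10. */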
8229 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8230 gimplify_assign (addr, t, pre_p);
8232 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8233 gimplify_assign (unshare_expr (ovf), t, pre_p);
8235 if (container)
8236 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8238 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8239 addr = fold_convert (ptrtype, addr);
8241 if (indirect_p)
8242 addr = build_va_arg_indirect_ref (addr);
8243 return build_va_arg_indirect_ref (addr);
8246 /* Return true if OPNUM's MEM should be matched
8247 in movabs* patterns. */
8249 bool
8250 ix86_check_movabs (rtx insn, int opnum)
8252 rtx set, mem;
8254 set = PATTERN (insn);
8255 if (GET_CODE (set) == PARALLEL)
8256 set = XVECEXP (set, 0, 0);
8257 gcc_assert (GET_CODE (set) == SET);
8258 mem = XEXP (set, opnum);
8259 while (GET_CODE (mem) == SUBREG)
8260 mem = SUBREG_REG (mem);
8261 gcc_assert (MEM_P (mem));
8262 return volatile_ok || !MEM_VOLATILE_P (mem);
8265 /* Initialize the table of extra 80387 mathematical constants. */
8267 static void
8268 init_ext_80387_constants (void)
8270 static const char * cst[5] =
8272 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8273 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8274 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8275 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8276 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8278 int i;
8280 for (i = 0; i < 5; i++)
8282 real_from_string (&ext_80387_constants_table[i], cst[i]);
8283 /* Ensure each constant is rounded to XFmode precision. */
8284 real_convert (&ext_80387_constants_table[i],
8285 XFmode, &ext_80387_constants_table[i]);
8288 ext_80387_constants_init = 1;
8291 /* Return non-zero if the constant is something that
8292 can be loaded with a special instruction. */
8295 standard_80387_constant_p (rtx x)
8297 enum machine_mode mode = GET_MODE (x);
8299 REAL_VALUE_TYPE r;
8301 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8302 return -1;
8304 if (x == CONST0_RTX (mode))
8305 return 1;
8306 if (x == CONST1_RTX (mode))
8307 return 2;
8309 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8311 /* For XFmode constants, try to find a special 80387 instruction when
8312 optimizing for size or on those CPUs that benefit from them. */
8313 if (mode == XFmode
8314 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8316 int i;
8318 if (! ext_80387_constants_init)
8319 init_ext_80387_constants ();
8321 for (i = 0; i < 5; i++)
8322 if (real_identical (&r, &ext_80387_constants_table[i]))
8323 return i + 3;
8326 /* A load of the constant -0.0 or -1.0 will be split into an
8327 fldz;fchs or fld1;fchs sequence. */
8328 if (real_isnegzero (&r))
8329 return 8;
8330 if (real_identical (&r, &dconstm1))
8331 return 9;
8333 return 0;
8336 /* Return the opcode of the special instruction to be used to load
8337 the constant X. */
8339 const char *
8340 standard_80387_constant_opcode (rtx x)
8342 switch (standard_80387_constant_p (x))
8344 case 1:
8345 return "fldz";
8346 case 2:
8347 return "fld1";
8348 case 3:
8349 return "fldlg2";
8350 case 4:
8351 return "fldln2";
8352 case 5:
8353 return "fldl2e";
8354 case 6:
8355 return "fldl2t";
8356 case 7:
8357 return "fldpi";
8358 case 8:
8359 case 9:
8360 return "#";
8361 default:
8362 gcc_unreachable ();
8366 /* Return the CONST_DOUBLE representing the 80387 constant that is
8367 loaded by the specified special instruction. The argument IDX
8368 matches the return value from standard_80387_constant_p. */
8371 standard_80387_constant_rtx (int idx)
8373 int i;
8375 if (! ext_80387_constants_init)
8376 init_ext_80387_constants ();
8378 switch (idx)
8380 case 3:
8381 case 4:
8382 case 5:
8383 case 6:
8384 case 7:
8385 i = idx - 3;
8386 break;
8388 default:
8389 gcc_unreachable ();
8392 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8393 XFmode);
8396 /* Return 1 if X is all 0s and 2 if X is all 1s
8397 in supported SSE/AVX vector mode. */
8400 standard_sse_constant_p (rtx x)
8402 enum machine_mode mode = GET_MODE (x);
8404 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8405 return 1;
8406 if (vector_all_ones_operand (x, mode))
8407 switch (mode)
8409 case V16QImode:
8410 case V8HImode:
8411 case V4SImode:
8412 case V2DImode:
8413 if (TARGET_SSE2)
8414 return 2;
8415 case V32QImode:
8416 case V16HImode:
8417 case V8SImode:
8418 case V4DImode:
8419 if (TARGET_AVX2)
8420 return 2;
8421 default:
8422 break;
8425 return 0;
8428 /* Return the opcode of the special instruction to be used to load
8429 the constant X. */
8431 const char *
8432 standard_sse_constant_opcode (rtx insn, rtx x)
8434 switch (standard_sse_constant_p (x))
8436 case 1:
8437 switch (get_attr_mode (insn))
8439 case MODE_TI:
8440 return "%vpxor\t%0, %d0";
8441 case MODE_V2DF:
8442 return "%vxorpd\t%0, %d0";
8443 case MODE_V4SF:
8444 return "%vxorps\t%0, %d0";
8446 case MODE_OI:
8447 return "vpxor\t%x0, %x0, %x0";
8448 case MODE_V4DF:
8449 return "vxorpd\t%x0, %x0, %x0";
8450 case MODE_V8SF:
8451 return "vxorps\t%x0, %x0, %x0";
8453 default:
8454 break;
8457 case 2:
8458 if (TARGET_AVX)
8459 return "vpcmpeqd\t%0, %0, %0";
8460 else
8461 return "pcmpeqd\t%0, %0";
8463 default:
8464 break;
8466 gcc_unreachable ();
8469 /* Returns true if OP contains a symbol reference. */
8471 bool
8472 symbolic_reference_mentioned_p (rtx op)
8474 const char *fmt;
8475 int i;
8477 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8478 return true;
8480 fmt = GET_RTX_FORMAT (GET_CODE (op));
8481 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8483 if (fmt[i] == 'E')
8485 int j;
8487 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8488 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8489 return true;
8492 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8493 return true;
8496 return false;
8499 /* Return true if it is appropriate to emit `ret' instructions in the
8500 body of a function. Do this only if the epilogue is simple, needing a
8501 couple of insns. Prior to reloading, we can't tell how many registers
8502 must be saved, so return false then. Return false if there is no frame
8503 marker to de-allocate. */
8505 bool
8506 ix86_can_use_return_insn_p (void)
8508 struct ix86_frame frame;
8510 if (! reload_completed || frame_pointer_needed)
8511 return 0;
8513 /* Don't allow more than 32k pop, since that's all we can do
8514 with one instruction. */
8515 if (crtl->args.pops_args && crtl->args.size >= 32768)
8516 return 0;
8518 ix86_compute_frame_layout (&frame);
8519 return (frame.stack_pointer_offset == UNITS_PER_WORD
8520 && (frame.nregs + frame.nsseregs) == 0);
8523 /* Value should be nonzero if functions must have frame pointers.
8524 Zero means the frame pointer need not be set up (and parms may
8525 be accessed via the stack pointer) in functions that seem suitable. */
8527 static bool
8528 ix86_frame_pointer_required (void)
8530 /* If we accessed previous frames, then the generated code expects
8531 to be able to access the saved ebp value in our frame. */
8532 if (cfun->machine->accesses_prev_frame)
8533 return true;
8535 /* Several x86 OSes need a frame pointer for other reasons,
8536 usually pertaining to setjmp. */
8537 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8538 return true;
8540 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8541 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8542 return true;
8544 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
8545 stack allocation is 4GB. */
8546 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8547 return true;
8549 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8550 turns off the frame pointer by default. Turn it back on now if
8551 we've not got a leaf function. */
8552 if (TARGET_OMIT_LEAF_FRAME_POINTER
8553 && (!crtl->is_leaf
8554 || ix86_current_function_calls_tls_descriptor))
8555 return true;
8557 if (crtl->profile && !flag_fentry)
8558 return true;
8560 return false;
8563 /* Record that the current function accesses previous call frames. */
8565 void
8566 ix86_setup_frame_addresses (void)
8568 cfun->machine->accesses_prev_frame = 1;
8571 #ifndef USE_HIDDEN_LINKONCE
8572 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8573 # define USE_HIDDEN_LINKONCE 1
8574 # else
8575 # define USE_HIDDEN_LINKONCE 0
8576 # endif
8577 #endif
8579 static int pic_labels_used;
8581 /* Fills in the label name that should be used for a pc thunk for
8582 the given register. */
8584 static void
8585 get_pc_thunk_name (char name[32], unsigned int regno)
8587 gcc_assert (!TARGET_64BIT);
8589 if (USE_HIDDEN_LINKONCE)
8590 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8591 else
8592 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8596 /* This function generates code for -fpic that loads %ebx with
8597 the return address of the caller and then returns. */
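/* For instance, the thunk emitted below for %ebx is just (sketch in AT&T
   syntax, assuming the hidden-linkonce name):

       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret

   and output_set_got pairs the call with an add of $_GLOBAL_OFFSET_TABLE_
   to form the PIC register. */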
8599 static void
8600 ix86_code_end (void)
8602 rtx xops[2];
8603 int regno;
8605 for (regno = AX_REG; regno <= SP_REG; regno++)
8607 char name[32];
8608 tree decl;
8610 if (!(pic_labels_used & (1 << regno)))
8611 continue;
8613 get_pc_thunk_name (name, regno);
8615 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8616 get_identifier (name),
8617 build_function_type_list (void_type_node, NULL_TREE));
8618 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8619 NULL_TREE, void_type_node);
8620 TREE_PUBLIC (decl) = 1;
8621 TREE_STATIC (decl) = 1;
8622 DECL_IGNORED_P (decl) = 1;
8624 #if TARGET_MACHO
8625 if (TARGET_MACHO)
8627 switch_to_section (darwin_sections[text_coal_section]);
8628 fputs ("\t.weak_definition\t", asm_out_file);
8629 assemble_name (asm_out_file, name);
8630 fputs ("\n\t.private_extern\t", asm_out_file);
8631 assemble_name (asm_out_file, name);
8632 putc ('\n', asm_out_file);
8633 ASM_OUTPUT_LABEL (asm_out_file, name);
8634 DECL_WEAK (decl) = 1;
8636 else
8637 #endif
8638 if (USE_HIDDEN_LINKONCE)
8640 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8642 targetm.asm_out.unique_section (decl, 0);
8643 switch_to_section (get_named_section (decl, NULL, 0));
8645 targetm.asm_out.globalize_label (asm_out_file, name);
8646 fputs ("\t.hidden\t", asm_out_file);
8647 assemble_name (asm_out_file, name);
8648 putc ('\n', asm_out_file);
8649 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8651 else
8653 switch_to_section (text_section);
8654 ASM_OUTPUT_LABEL (asm_out_file, name);
8657 DECL_INITIAL (decl) = make_node (BLOCK);
8658 current_function_decl = decl;
8659 init_function_start (decl);
8660 first_function_block_is_cold = false;
8661 /* Make sure unwind info is emitted for the thunk if needed. */
8662 final_start_function (emit_barrier (), asm_out_file, 1);
8664 /* Pad stack IP move with 4 instructions (two NOPs count
8665 as one instruction). */
8666 if (TARGET_PAD_SHORT_FUNCTION)
8668 int i = 8;
8670 while (i--)
8671 fputs ("\tnop\n", asm_out_file);
8674 xops[0] = gen_rtx_REG (Pmode, regno);
8675 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8676 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8677 fputs ("\tret\n", asm_out_file);
8678 final_end_function ();
8679 init_insn_lengths ();
8680 free_after_compilation (cfun);
8681 set_cfun (NULL);
8682 current_function_decl = NULL;
8685 if (flag_split_stack)
8686 file_end_indicate_split_stack ();
8689 /* Emit code for the SET_GOT patterns. */
8691 const char *
8692 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8694 rtx xops[3];
8696 xops[0] = dest;
8698 if (TARGET_VXWORKS_RTP && flag_pic)
8700 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8701 xops[2] = gen_rtx_MEM (Pmode,
8702 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8703 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8705 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8706 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8707 an unadorned address. */
8708 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8709 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8710 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8711 return "";
8714 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8716 if (!flag_pic)
8718 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8720 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8722 #if TARGET_MACHO
8723 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8724 is what will be referenced by the Mach-O PIC subsystem. */
8725 if (!label)
8726 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8727 #endif
8729 targetm.asm_out.internal_label (asm_out_file, "L",
8730 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8732 else
8734 char name[32];
8735 get_pc_thunk_name (name, REGNO (dest));
8736 pic_labels_used |= 1 << REGNO (dest);
8738 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8739 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8740 output_asm_insn ("call\t%X2", xops);
8741 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8742 is what will be referenced by the Mach-O PIC subsystem. */
8743 #if TARGET_MACHO
8744 if (!label)
8745 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8746 else
8747 targetm.asm_out.internal_label (asm_out_file, "L",
8748 CODE_LABEL_NUMBER (label));
8749 #endif
8752 if (!TARGET_MACHO)
8753 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8755 return "";
8758 /* Generate a "push" pattern for input ARG. */
8760 static rtx
8761 gen_push (rtx arg)
8763 struct machine_function *m = cfun->machine;
8765 if (m->fs.cfa_reg == stack_pointer_rtx)
8766 m->fs.cfa_offset += UNITS_PER_WORD;
8767 m->fs.sp_offset += UNITS_PER_WORD;
8769 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8770 arg = gen_rtx_REG (word_mode, REGNO (arg));
8772 return gen_rtx_SET (VOIDmode,
8773 gen_rtx_MEM (word_mode,
8774 gen_rtx_PRE_DEC (Pmode,
8775 stack_pointer_rtx)),
8776 arg);
8779 /* Generate a "pop" pattern for input ARG. */
8781 static rtx
8782 gen_pop (rtx arg)
8784 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8785 arg = gen_rtx_REG (word_mode, REGNO (arg));
8787 return gen_rtx_SET (VOIDmode,
8788 arg,
8789 gen_rtx_MEM (word_mode,
8790 gen_rtx_POST_INC (Pmode,
8791 stack_pointer_rtx)));
8794 /* Return >= 0 if there is an unused call-clobbered register available
8795 for the entire function. */
8797 static unsigned int
8798 ix86_select_alt_pic_regnum (void)
8800 if (crtl->is_leaf
8801 && !crtl->profile
8802 && !ix86_current_function_calls_tls_descriptor)
8804 int i, drap;
8805 /* Can't use the same register for both PIC and DRAP. */
8806 if (crtl->drap_reg)
8807 drap = REGNO (crtl->drap_reg);
8808 else
8809 drap = -1;
8810 for (i = 2; i >= 0; --i)
8811 if (i != drap && !df_regs_ever_live_p (i))
8812 return i;
8815 return INVALID_REGNUM;
8818 /* Return TRUE if we need to save REGNO. */
8820 static bool
8821 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8823 if (pic_offset_table_rtx
8824 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8825 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8826 || crtl->profile
8827 || crtl->calls_eh_return
8828 || crtl->uses_const_pool))
8829 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8831 if (crtl->calls_eh_return && maybe_eh_return)
8833 unsigned i;
8834 for (i = 0; ; i++)
8836 unsigned test = EH_RETURN_DATA_REGNO (i);
8837 if (test == INVALID_REGNUM)
8838 break;
8839 if (test == regno)
8840 return true;
8844 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8845 return true;
8847 return (df_regs_ever_live_p (regno)
8848 && !call_used_regs[regno]
8849 && !fixed_regs[regno]
8850 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8853 /* Return the number of saved general purpose registers. */
8855 static int
8856 ix86_nsaved_regs (void)
8858 int nregs = 0;
8859 int regno;
8861 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8862 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8863 nregs ++;
8864 return nregs;
8867 /* Return the number of saved SSE registers. */
8869 static int
8870 ix86_nsaved_sseregs (void)
8872 int nregs = 0;
8873 int regno;
8875 if (!TARGET_64BIT_MS_ABI)
8876 return 0;
8877 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8878 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8879 nregs ++;
8880 return nregs;
8883 /* Given FROM and TO register numbers, say whether this elimination is
8884 allowed. If stack alignment is needed, we can only replace argument
8885 pointer with hard frame pointer, or replace frame pointer with stack
8886 pointer. Otherwise, frame pointer elimination is automatically
8887 handled and all other eliminations are valid. */
8889 static bool
8890 ix86_can_eliminate (const int from, const int to)
8892 if (stack_realign_fp)
8893 return ((from == ARG_POINTER_REGNUM
8894 && to == HARD_FRAME_POINTER_REGNUM)
8895 || (from == FRAME_POINTER_REGNUM
8896 && to == STACK_POINTER_REGNUM));
8897 else
8898 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8901 /* Return the offset between two registers, one to be eliminated, and the other
8902 its replacement, at the start of a routine. */
8904 HOST_WIDE_INT
8905 ix86_initial_elimination_offset (int from, int to)
8907 struct ix86_frame frame;
8908 ix86_compute_frame_layout (&frame);
8910 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8911 return frame.hard_frame_pointer_offset;
8912 else if (from == FRAME_POINTER_REGNUM
8913 && to == HARD_FRAME_POINTER_REGNUM)
8914 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8915 else
8917 gcc_assert (to == STACK_POINTER_REGNUM);
8919 if (from == ARG_POINTER_REGNUM)
8920 return frame.stack_pointer_offset;
8922 gcc_assert (from == FRAME_POINTER_REGNUM);
8923 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8927 /* In a dynamically-aligned function, we can't know the offset from
8928 stack pointer to frame pointer, so we must ensure that setjmp
8929 eliminates fp against the hard fp (%ebp) rather than trying to
8930 index from %esp up to the top of the frame across a gap that is
8931 of unknown (at compile-time) size. */
8932 static rtx
8933 ix86_builtin_setjmp_frame_value (void)
8935 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8938 /* When using -fsplit-stack, the allocation routines set a field in
8939 the TCB to the bottom of the stack plus this much space, measured
8940 in bytes. */
8942 #define SPLIT_STACK_AVAILABLE 256
8944 /* Fill the ix86_frame structure describing the frame of the current function.  */
8946 static void
8947 ix86_compute_frame_layout (struct ix86_frame *frame)
8949 unsigned HOST_WIDE_INT stack_alignment_needed;
8950 HOST_WIDE_INT offset;
8951 unsigned HOST_WIDE_INT preferred_alignment;
8952 HOST_WIDE_INT size = get_frame_size ();
8953 HOST_WIDE_INT to_allocate;
8955 frame->nregs = ix86_nsaved_regs ();
8956 frame->nsseregs = ix86_nsaved_sseregs ();
8958 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8959 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8961 /* The 64-bit MS ABI seems to require the stack alignment to always be 16, except
8962 for function prologues and leaf functions.  */
8963 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8964 && (!crtl->is_leaf || cfun->calls_alloca != 0
8965 || ix86_current_function_calls_tls_descriptor))
8967 preferred_alignment = 16;
8968 stack_alignment_needed = 16;
8969 crtl->preferred_stack_boundary = 128;
8970 crtl->stack_alignment_needed = 128;
8973 gcc_assert (!size || stack_alignment_needed);
8974 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8975 gcc_assert (preferred_alignment <= stack_alignment_needed);
8977 /* For SEH we have to limit the amount of code movement into the prologue.
8978 At present we do this via a BLOCKAGE, at which point there's very little
8979 scheduling that can be done, which means that there's very little point
8980 in doing anything except PUSHs. */
8981 if (TARGET_SEH)
8982 cfun->machine->use_fast_prologue_epilogue = false;
8984 /* During reload iteration the number of registers saved can change.
8985 Recompute the value as needed.  Do not recompute when the number of registers
8986 didn't change, as reload makes multiple calls to this function and does not
8987 expect the decision to change within a single iteration.  */
8988 else if (!optimize_function_for_size_p (cfun)
8989 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8991 int count = frame->nregs;
8992 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8994 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8996 /* The fast prologue uses move instead of push to save registers. This
8997 is significantly longer, but also executes faster as modern hardware
8998 can execute the moves in parallel, but can't do that for push/pop.
9000 Be careful about choosing which prologue to emit: when the function takes
9001 many instructions to execute we may use the slow version, as well as when
9002 the function is known to be outside a hot spot (this is known with
9003 feedback only).  Weight the size of the function by the number of registers
9004 to save, as it is cheap to use one or two push instructions but very
9005 slow to use many of them.  */
9006 if (count)
9007 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9008 if (node->frequency < NODE_FREQUENCY_NORMAL
9009 || (flag_branch_probabilities
9010 && node->frequency < NODE_FREQUENCY_HOT))
9011 cfun->machine->use_fast_prologue_epilogue = false;
9012 else
9013 cfun->machine->use_fast_prologue_epilogue
9014 = !expensive_function_p (count);
9017 frame->save_regs_using_mov
9018 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9019 /* If static stack checking is enabled and done with probes,
9020 the registers need to be saved before allocating the frame. */
9021 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9023 /* Skip return address. */
9024 offset = UNITS_PER_WORD;
9026 /* Skip pushed static chain. */
9027 if (ix86_static_chain_on_stack)
9028 offset += UNITS_PER_WORD;
9030 /* Skip saved base pointer. */
9031 if (frame_pointer_needed)
9032 offset += UNITS_PER_WORD;
9033 frame->hfp_save_offset = offset;
9035 /* The traditional frame pointer location is at the top of the frame. */
9036 frame->hard_frame_pointer_offset = offset;
9038 /* Register save area */
9039 offset += frame->nregs * UNITS_PER_WORD;
9040 frame->reg_save_offset = offset;
9042 /* On SEH target, registers are pushed just before the frame pointer
9043 location. */
9044 if (TARGET_SEH)
9045 frame->hard_frame_pointer_offset = offset;
9047 /* Align and set SSE register save area. */
9048 if (frame->nsseregs)
9050 /* The only ABI that has saved SSE registers (Win64) also has a
9051 16-byte aligned default stack, and thus we don't need to be
9052 within the re-aligned local stack frame to save them. */
9053 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9054 offset = (offset + 16 - 1) & -16;
9055 offset += frame->nsseregs * 16;
9057 frame->sse_reg_save_offset = offset;
9059 /* The re-aligned stack starts here. Values before this point are not
9060 directly comparable with values below this point. In order to make
9061 sure that no value happens to be the same before and after, force
9062 the alignment computation below to add a non-zero value. */
9063 if (stack_realign_fp)
9064 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9066 /* Va-arg area */
9067 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9068 offset += frame->va_arg_size;
9070 /* Align start of frame for local function. */
9071 if (stack_realign_fp
9072 || offset != frame->sse_reg_save_offset
9073 || size != 0
9074 || !crtl->is_leaf
9075 || cfun->calls_alloca
9076 || ix86_current_function_calls_tls_descriptor)
9077 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9079 /* Frame pointer points here. */
9080 frame->frame_pointer_offset = offset;
9082 offset += size;
9084 /* Add the outgoing arguments area.  Can be skipped if we eliminated
9085 all the function calls as dead code.
9086 Skipping is however impossible when the function calls alloca, since the
9087 alloca expander assumes that the last crtl->outgoing_args_size bytes
9088 of the stack frame are unused.  */
9089 if (ACCUMULATE_OUTGOING_ARGS
9090 && (!crtl->is_leaf || cfun->calls_alloca
9091 || ix86_current_function_calls_tls_descriptor))
9093 offset += crtl->outgoing_args_size;
9094 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9096 else
9097 frame->outgoing_arguments_size = 0;
9099 /* Align stack boundary. Only needed if we're calling another function
9100 or using alloca. */
9101 if (!crtl->is_leaf || cfun->calls_alloca
9102 || ix86_current_function_calls_tls_descriptor)
9103 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9105 /* We've reached end of stack frame. */
9106 frame->stack_pointer_offset = offset;
9108 /* Size prologue needs to allocate. */
9109 to_allocate = offset - frame->sse_reg_save_offset;
9111 if ((!to_allocate && frame->nregs <= 1)
9112 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9113 frame->save_regs_using_mov = false;
9115 if (ix86_using_red_zone ()
9116 && crtl->sp_is_unchanging
9117 && crtl->is_leaf
9118 && !ix86_current_function_calls_tls_descriptor)
9120 frame->red_zone_size = to_allocate;
9121 if (frame->save_regs_using_mov)
9122 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9123 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9124 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9126 else
9127 frame->red_zone_size = 0;
9128 frame->stack_pointer_offset -= frame->red_zone_size;
9130 /* The SEH frame pointer location is near the bottom of the frame.
9131 This is enforced by the fact that the difference between the
9132 stack pointer and the frame pointer is limited to 240 bytes in
9133 the unwind data structure. */
9134 if (TARGET_SEH)
9136 HOST_WIDE_INT diff;
9138 /* If we can leave the frame pointer where it is, do so.  This also
9139 returns the establisher frame for __builtin_frame_address (0).  */
9140 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9141 if (diff <= SEH_MAX_FRAME_SIZE
9142 && (diff > 240 || (diff & 15) != 0)
9143 && !crtl->accesses_prior_frames)
9145 /* Ideally we'd determine what portion of the local stack frame
9146 (within the constraint of the lowest 240) is most heavily used.
9147 But without that complication, simply bias the frame pointer
9148 by 128 bytes so as to maximize the amount of the local stack
9149 frame that is addressable with 8-bit offsets. */
9150 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
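
/* An illustrative sketch (not from the original sources) of the layout
   computed above, with offsets growing downward from the CFA:

       return address
       pushed static chain, if any
       saved frame pointer, if any       <- hard_frame_pointer_offset (no SEH)
       GP register save area             <- reg_save_offset
       SSE register save area (Win64)    <- sse_reg_save_offset
       va_arg register save area
       local variables                   <- frame_pointer_offset
       outgoing argument area
                                         <- stack_pointer_offset
       red zone, below the final stack pointer

   On SEH targets the hard frame pointer is instead placed near the
   bottom of the frame, as handled at the end of the function.  */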
9155 /* This is semi-inlined memory_address_length, but simplified
9156 since we know that we're always dealing with reg+offset, and
9157 to avoid having to create and discard all that rtl. */
9159 static inline int
9160 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9162 int len = 4;
9164 if (offset == 0)
9166 /* EBP and R13 cannot be encoded without an offset. */
9167 len = (regno == BP_REG || regno == R13_REG);
9169 else if (IN_RANGE (offset, -128, 127))
9170 len = 1;
9172 /* ESP and R12 must be encoded with a SIB byte. */
9173 if (regno == SP_REG || regno == R12_REG)
9174 len++;
9176 return len;
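
/* Illustrative examples of the length computed above (not from the
   original sources): (%ebp) needs a zero disp8, so len == 1; 16(%esp)
   needs a SIB byte plus a disp8, so len == 2; 512(%eax) needs a full
   disp32, so len == 4.  */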
9179 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9180 The valid base registers are taken from CFUN->MACHINE->FS. */
9182 static rtx
9183 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9185 const struct machine_function *m = cfun->machine;
9186 rtx base_reg = NULL;
9187 HOST_WIDE_INT base_offset = 0;
9189 if (m->use_fast_prologue_epilogue)
9191 /* Choose the base register most likely to allow the most scheduling
9192 opportunities. Generally FP is valid throughout the function,
9193 while DRAP must be reloaded within the epilogue.  But prefer either
9194 over the SP, due to its larger encoding size.  */
9196 if (m->fs.fp_valid)
9198 base_reg = hard_frame_pointer_rtx;
9199 base_offset = m->fs.fp_offset - cfa_offset;
9201 else if (m->fs.drap_valid)
9203 base_reg = crtl->drap_reg;
9204 base_offset = 0 - cfa_offset;
9206 else if (m->fs.sp_valid)
9208 base_reg = stack_pointer_rtx;
9209 base_offset = m->fs.sp_offset - cfa_offset;
9212 else
9214 HOST_WIDE_INT toffset;
9215 int len = 16, tlen;
9217 /* Choose the base register with the smallest address encoding.
9218 With a tie, choose FP > DRAP > SP. */
9219 if (m->fs.sp_valid)
9221 base_reg = stack_pointer_rtx;
9222 base_offset = m->fs.sp_offset - cfa_offset;
9223 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9225 if (m->fs.drap_valid)
9227 toffset = 0 - cfa_offset;
9228 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9229 if (tlen <= len)
9231 base_reg = crtl->drap_reg;
9232 base_offset = toffset;
9233 len = tlen;
9236 if (m->fs.fp_valid)
9238 toffset = m->fs.fp_offset - cfa_offset;
9239 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9240 if (tlen <= len)
9242 base_reg = hard_frame_pointer_rtx;
9243 base_offset = toffset;
9244 len = tlen;
9248 gcc_assert (base_reg != NULL);
9250 return plus_constant (Pmode, base_reg, base_offset);
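
/* Illustrative example (not from the original sources), assuming both
   fs.sp_valid and fs.fp_valid: with fs.sp_offset == 64, fs.fp_offset == 16
   and cfa_offset == 24, the SP-based address is 40(%rsp) (SIB + disp8,
   length 2) while the FP-based address is -8(%rbp) (disp8 only, length 1),
   so the size-optimizing branch above picks the frame pointer.  */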
9253 /* Emit code to save registers in the prologue. */
9255 static void
9256 ix86_emit_save_regs (void)
9258 unsigned int regno;
9259 rtx insn;
9261 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9262 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9264 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9265 RTX_FRAME_RELATED_P (insn) = 1;
9269 /* Emit a single register save at CFA - CFA_OFFSET. */
9271 static void
9272 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9273 HOST_WIDE_INT cfa_offset)
9275 struct machine_function *m = cfun->machine;
9276 rtx reg = gen_rtx_REG (mode, regno);
9277 rtx mem, addr, base, insn;
9279 addr = choose_baseaddr (cfa_offset);
9280 mem = gen_frame_mem (mode, addr);
9282 /* For SSE saves, we need to indicate the 128-bit alignment. */
9283 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9285 insn = emit_move_insn (mem, reg);
9286 RTX_FRAME_RELATED_P (insn) = 1;
9288 base = addr;
9289 if (GET_CODE (base) == PLUS)
9290 base = XEXP (base, 0);
9291 gcc_checking_assert (REG_P (base));
9293 /* When saving registers into a re-aligned local stack frame, avoid
9294 any tricky guessing by dwarf2out. */
9295 if (m->fs.realigned)
9297 gcc_checking_assert (stack_realign_drap);
9299 if (regno == REGNO (crtl->drap_reg))
9301 /* A bit of a hack. We force the DRAP register to be saved in
9302 the re-aligned stack frame, which provides us with a copy
9303 of the CFA that will last past the prologue. Install it. */
9304 gcc_checking_assert (cfun->machine->fs.fp_valid);
9305 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9306 cfun->machine->fs.fp_offset - cfa_offset);
9307 mem = gen_rtx_MEM (mode, addr);
9308 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9310 else
9312 /* The frame pointer is a stable reference within the
9313 aligned frame. Use it. */
9314 gcc_checking_assert (cfun->machine->fs.fp_valid);
9315 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9316 cfun->machine->fs.fp_offset - cfa_offset);
9317 mem = gen_rtx_MEM (mode, addr);
9318 add_reg_note (insn, REG_CFA_EXPRESSION,
9319 gen_rtx_SET (VOIDmode, mem, reg));
9323 /* The memory may not be relative to the current CFA register,
9324 which means that we may need to generate a new pattern for
9325 use by the unwind info. */
9326 else if (base != m->fs.cfa_reg)
9328 addr = plus_constant (Pmode, m->fs.cfa_reg,
9329 m->fs.cfa_offset - cfa_offset);
9330 mem = gen_rtx_MEM (mode, addr);
9331 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9335 /* Emit code to save registers using MOV insns.
9336 First register is stored at CFA - CFA_OFFSET. */
9337 static void
9338 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9340 unsigned int regno;
9342 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9343 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9345 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9346 cfa_offset -= UNITS_PER_WORD;
9350 /* Emit code to save SSE registers using MOV insns.
9351 First register is stored at CFA - CFA_OFFSET. */
9352 static void
9353 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9355 unsigned int regno;
9357 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9358 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9360 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9361 cfa_offset -= 16;
9365 static GTY(()) rtx queued_cfa_restores;
9367 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9368 manipulation insn.  The value is on the stack at CFA - CFA_OFFSET.
9369 Don't add the note if the previously saved value will be left untouched
9370 within the stack red zone until return, as unwinders can find the same value
9371 in the register and on the stack.  */
9373 static void
9374 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9376 if (!crtl->shrink_wrapped
9377 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9378 return;
9380 if (insn)
9382 add_reg_note (insn, REG_CFA_RESTORE, reg);
9383 RTX_FRAME_RELATED_P (insn) = 1;
9385 else
9386 queued_cfa_restores
9387 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9390 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9392 static void
9393 ix86_add_queued_cfa_restore_notes (rtx insn)
9395 rtx last;
9396 if (!queued_cfa_restores)
9397 return;
9398 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9400 XEXP (last, 1) = REG_NOTES (insn);
9401 REG_NOTES (insn) = queued_cfa_restores;
9402 queued_cfa_restores = NULL_RTX;
9403 RTX_FRAME_RELATED_P (insn) = 1;
9406 /* Expand prologue or epilogue stack adjustment.
9407 The pattern exists to put a dependency on all ebp-based memory accesses.
9408 STYLE should be negative if instructions should be marked as frame related,
9409 zero if the %r11 register is live and cannot be freely used, and positive
9410 otherwise.  */
9412 static void
9413 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9414 int style, bool set_cfa)
9416 struct machine_function *m = cfun->machine;
9417 rtx insn;
9418 bool add_frame_related_expr = false;
9420 if (Pmode == SImode)
9421 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9422 else if (x86_64_immediate_operand (offset, DImode))
9423 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9424 else
9426 rtx tmp;
9427 /* r11 is used by indirect sibcall return as well, set before the
9428 epilogue and used after the epilogue. */
9429 if (style)
9430 tmp = gen_rtx_REG (DImode, R11_REG);
9431 else
9433 gcc_assert (src != hard_frame_pointer_rtx
9434 && dest != hard_frame_pointer_rtx);
9435 tmp = hard_frame_pointer_rtx;
9437 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9438 if (style < 0)
9439 add_frame_related_expr = true;
9441 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9444 insn = emit_insn (insn);
9445 if (style >= 0)
9446 ix86_add_queued_cfa_restore_notes (insn);
9448 if (set_cfa)
9450 rtx r;
9452 gcc_assert (m->fs.cfa_reg == src);
9453 m->fs.cfa_offset += INTVAL (offset);
9454 m->fs.cfa_reg = dest;
9456 r = gen_rtx_PLUS (Pmode, src, offset);
9457 r = gen_rtx_SET (VOIDmode, dest, r);
9458 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9459 RTX_FRAME_RELATED_P (insn) = 1;
9461 else if (style < 0)
9463 RTX_FRAME_RELATED_P (insn) = 1;
9464 if (add_frame_related_expr)
9466 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9467 r = gen_rtx_SET (VOIDmode, dest, r);
9468 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9472 if (dest == stack_pointer_rtx)
9474 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9475 bool valid = m->fs.sp_valid;
9477 if (src == hard_frame_pointer_rtx)
9479 valid = m->fs.fp_valid;
9480 ooffset = m->fs.fp_offset;
9482 else if (src == crtl->drap_reg)
9484 valid = m->fs.drap_valid;
9485 ooffset = 0;
9487 else
9489 /* Else there are two possibilities: SP itself, which we set
9490 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9491 taken care of by hand along the eh_return path.  */
9492 gcc_checking_assert (src == stack_pointer_rtx
9493 || offset == const0_rtx);
9496 m->fs.sp_offset = ooffset - INTVAL (offset);
9497 m->fs.sp_valid = valid;
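
/* For reference, a typical call later in this file is

       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                  GEN_INT (-allocate), -1,
                                  m->fs.cfa_reg == stack_pointer_rtx);

   which emits a frame-related stack-pointer subtraction (or the
   R11-based sequence when the constant does not fit a sign-extended
   32-bit immediate) and updates the frame-state tracking in m->fs.  */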
9501 /* Find an available register to be used as the dynamic realign argument
9502 pointer register.  Such a register will be written in the prologue and
9503 used at the beginning of the body, so it must not be
9504 1. a parameter passing register.
9505 2. the GOT pointer.
9506 We reuse the static-chain register if it is available.  Otherwise, we
9507 use DI for i386 and R13 for x86-64.  We chose R13 since it has a
9508 shorter encoding.
9510 Return: the regno of the chosen register.  */
9512 static unsigned int
9513 find_drap_reg (void)
9515 tree decl = cfun->decl;
9517 if (TARGET_64BIT)
9519 /* Use R13 for a nested function or a function that needs a static chain.
9520 Since a function with a tail call may use any caller-saved
9521 register in the epilogue, DRAP must not use a caller-saved
9522 register in that case.  */
9523 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9524 return R13_REG;
9526 return R10_REG;
9528 else
9530 /* Use DI for a nested function or a function that needs a static chain.
9531 Since a function with a tail call may use any caller-saved
9532 register in the epilogue, DRAP must not use a caller-saved
9533 register in that case.  */
9534 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9535 return DI_REG;
9537 /* Reuse static chain register if it isn't used for parameter
9538 passing. */
9539 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9541 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9542 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9543 return CX_REG;
9545 return DI_REG;
9549 /* Return minimum incoming stack alignment. */
9551 static unsigned int
9552 ix86_minimum_incoming_stack_boundary (bool sibcall)
9554 unsigned int incoming_stack_boundary;
9556 /* Prefer the one specified at command line. */
9557 if (ix86_user_incoming_stack_boundary)
9558 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9559 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9560 if -mstackrealign is used, this isn't a sibcall check, and the
9561 estimated stack alignment is 128 bits.  */
9562 else if (!sibcall
9563 && !TARGET_64BIT
9564 && ix86_force_align_arg_pointer
9565 && crtl->stack_alignment_estimated == 128)
9566 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9567 else
9568 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9570 /* Incoming stack alignment can be changed on individual functions
9571 via force_align_arg_pointer attribute. We use the smallest
9572 incoming stack boundary. */
9573 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9574 && lookup_attribute (ix86_force_align_arg_pointer_string,
9575 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9576 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9578 /* The incoming stack frame has to be aligned at least at
9579 parm_stack_boundary. */
9580 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9581 incoming_stack_boundary = crtl->parm_stack_boundary;
9583 /* The stack at the entry of main is aligned by the runtime.  We use the
9584 smallest incoming stack boundary.  */
9585 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9586 && DECL_NAME (current_function_decl)
9587 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9588 && DECL_FILE_SCOPE_P (current_function_decl))
9589 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9591 return incoming_stack_boundary;
9594 /* Update incoming stack boundary and estimated stack alignment. */
9596 static void
9597 ix86_update_stack_boundary (void)
9599 ix86_incoming_stack_boundary
9600 = ix86_minimum_incoming_stack_boundary (false);
9602 /* x86_64 varargs need 16-byte stack alignment for the register save
9603 area. */
9604 if (TARGET_64BIT
9605 && cfun->stdarg
9606 && crtl->stack_alignment_estimated < 128)
9607 crtl->stack_alignment_estimated = 128;
9610 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9611 needed or an rtx for DRAP otherwise. */
9613 static rtx
9614 ix86_get_drap_rtx (void)
9616 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9617 crtl->need_drap = true;
9619 if (stack_realign_drap)
9621 /* Assign DRAP to vDRAP and return vDRAP.  */
9622 unsigned int regno = find_drap_reg ();
9623 rtx drap_vreg;
9624 rtx arg_ptr;
9625 rtx seq, insn;
9627 arg_ptr = gen_rtx_REG (Pmode, regno);
9628 crtl->drap_reg = arg_ptr;
9630 start_sequence ();
9631 drap_vreg = copy_to_reg (arg_ptr);
9632 seq = get_insns ();
9633 end_sequence ();
9635 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9636 if (!optimize)
9638 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9639 RTX_FRAME_RELATED_P (insn) = 1;
9641 return drap_vreg;
9643 else
9644 return NULL;
9647 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9649 static rtx
9650 ix86_internal_arg_pointer (void)
9652 return virtual_incoming_args_rtx;
9655 struct scratch_reg {
9656 rtx reg;
9657 bool saved;
9660 /* Return a short-lived scratch register for use on function entry.
9661 In 32-bit mode, it is valid only after the registers are saved
9662 in the prologue. This register must be released by means of
9663 release_scratch_register_on_entry once it is dead. */
9665 static void
9666 get_scratch_register_on_entry (struct scratch_reg *sr)
9668 int regno;
9670 sr->saved = false;
9672 if (TARGET_64BIT)
9674 /* We always use R11 in 64-bit mode. */
9675 regno = R11_REG;
9677 else
9679 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9680 bool fastcall_p
9681 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9682 bool thiscall_p
9683 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9684 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9685 int regparm = ix86_function_regparm (fntype, decl);
9686 int drap_regno
9687 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9689 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9690 for the static chain register. */
9691 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9692 && drap_regno != AX_REG)
9693 regno = AX_REG;
9694 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9695 for the static chain register. */
9696 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9697 regno = AX_REG;
9698 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9699 regno = DX_REG;
9700 /* ecx is the static chain register. */
9701 else if (regparm < 3 && !fastcall_p && !thiscall_p
9702 && !static_chain_p
9703 && drap_regno != CX_REG)
9704 regno = CX_REG;
9705 else if (ix86_save_reg (BX_REG, true))
9706 regno = BX_REG;
9707 /* esi is the static chain register. */
9708 else if (!(regparm == 3 && static_chain_p)
9709 && ix86_save_reg (SI_REG, true))
9710 regno = SI_REG;
9711 else if (ix86_save_reg (DI_REG, true))
9712 regno = DI_REG;
9713 else
9715 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9716 sr->saved = true;
9720 sr->reg = gen_rtx_REG (Pmode, regno);
9721 if (sr->saved)
9723 rtx insn = emit_insn (gen_push (sr->reg));
9724 RTX_FRAME_RELATED_P (insn) = 1;
9728 /* Release a scratch register obtained from the preceding function. */
9730 static void
9731 release_scratch_register_on_entry (struct scratch_reg *sr)
9733 if (sr->saved)
9735 struct machine_function *m = cfun->machine;
9736 rtx x, insn = emit_insn (gen_pop (sr->reg));
9738 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9739 RTX_FRAME_RELATED_P (insn) = 1;
9740 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9741 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9742 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9743 m->fs.sp_offset -= UNITS_PER_WORD;
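
/* The two helpers above bracket code that needs a temporary register,
   as done in ix86_adjust_stack_and_probe below:

       struct scratch_reg sr;
       get_scratch_register_on_entry (&sr);
       ... use sr.reg ...
       release_scratch_register_on_entry (&sr);

   If no suitable call-clobbered register is free, the pair degenerates
   to a push/pop around the code that uses the scratch register.  */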
9747 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9749 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9751 static void
9752 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9754 /* We skip the probe for the first interval + a small dope of 4 words and
9755 probe that many bytes past the specified size to maintain a protection
9756 area at the bottom of the stack.  */
9757 const int dope = 4 * UNITS_PER_WORD;
9758 rtx size_rtx = GEN_INT (size), last;
9760 /* See if we have a constant small number of probes to generate. If so,
9761 that's the easy case. The run-time loop is made up of 11 insns in the
9762 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9763 for n # of intervals. */
9764 if (size <= 5 * PROBE_INTERVAL)
9766 HOST_WIDE_INT i, adjust;
9767 bool first_probe = true;
9769 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9770 values of N from 1 until it exceeds SIZE. If only one probe is
9771 needed, this will not generate any code. Then adjust and probe
9772 to PROBE_INTERVAL + SIZE. */
9773 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9775 if (first_probe)
9777 adjust = 2 * PROBE_INTERVAL + dope;
9778 first_probe = false;
9780 else
9781 adjust = PROBE_INTERVAL;
9783 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9784 plus_constant (Pmode, stack_pointer_rtx,
9785 -adjust)));
9786 emit_stack_probe (stack_pointer_rtx);
9789 if (first_probe)
9790 adjust = size + PROBE_INTERVAL + dope;
9791 else
9792 adjust = size + PROBE_INTERVAL - i;
9794 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9795 plus_constant (Pmode, stack_pointer_rtx,
9796 -adjust)));
9797 emit_stack_probe (stack_pointer_rtx);
9799 /* Adjust back to account for the additional first interval. */
9800 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9801 plus_constant (Pmode, stack_pointer_rtx,
9802 PROBE_INTERVAL + dope)));
9805 /* Otherwise, do the same as above, but in a loop. Note that we must be
9806 extra careful with variables wrapping around because we might be at
9807 the very top (or the very bottom) of the address space and we have
9808 to be able to handle this case properly; in particular, we use an
9809 equality test for the loop condition. */
9810 else
9812 HOST_WIDE_INT rounded_size;
9813 struct scratch_reg sr;
9815 get_scratch_register_on_entry (&sr);
9818 /* Step 1: round SIZE to the previous multiple of the interval. */
9820 rounded_size = size & -PROBE_INTERVAL;
9823 /* Step 2: compute initial and final value of the loop counter. */
9825 /* SP = SP_0 + PROBE_INTERVAL. */
9826 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9827 plus_constant (Pmode, stack_pointer_rtx,
9828 - (PROBE_INTERVAL + dope))));
9830 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9831 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9832 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9833 gen_rtx_PLUS (Pmode, sr.reg,
9834 stack_pointer_rtx)));
9837 /* Step 3: the loop
9839 while (SP != LAST_ADDR)
9841 SP = SP + PROBE_INTERVAL
9842 probe at SP
9845 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9846 values of N from 1 until it is equal to ROUNDED_SIZE. */
9848 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9851 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9852 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9854 if (size != rounded_size)
9856 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9857 plus_constant (Pmode, stack_pointer_rtx,
9858 rounded_size - size)));
9859 emit_stack_probe (stack_pointer_rtx);
9862 /* Adjust back to account for the additional first interval. */
9863 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9864 plus_constant (Pmode, stack_pointer_rtx,
9865 PROBE_INTERVAL + dope)));
9867 release_scratch_register_on_entry (&sr);
9870 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9872 /* Even if the stack pointer isn't the CFA register, we need to correctly
9873 describe the adjustments made to it, in particular differentiate the
9874 frame-related ones from the frame-unrelated ones. */
9875 if (size > 0)
9877 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9878 XVECEXP (expr, 0, 0)
9879 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9880 plus_constant (Pmode, stack_pointer_rtx, -size));
9881 XVECEXP (expr, 0, 1)
9882 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9883 plus_constant (Pmode, stack_pointer_rtx,
9884 PROBE_INTERVAL + dope + size));
9885 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9886 RTX_FRAME_RELATED_P (last) = 1;
9888 cfun->machine->fs.sp_offset += size;
9891 /* Make sure nothing is scheduled before we are done. */
9892 emit_insn (gen_blockage ());
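
/* Illustrative expansion of the unrolled case above (assuming the usual
   PROBE_INTERVAL of 4096 bytes and, on 64-bit, a dope of 32 bytes) for a
   constant 12288-byte allocation, shown schematically:

       sub     $8224, %rsp     # 2*4096 + dope; first interval skipped
       or      $0, (%rsp)
       sub     $4096, %rsp
       or      $0, (%rsp)
       sub     $4096, %rsp     # size + PROBE_INTERVAL - i
       or      $0, (%rsp)
       add     $4128, %rsp     # give back PROBE_INTERVAL + dope

   for a net adjustment of 12288 bytes, with every new page touched.  */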
9895 /* Adjust the stack pointer up to REG while probing it. */
9897 const char *
9898 output_adjust_stack_and_probe (rtx reg)
9900 static int labelno = 0;
9901 char loop_lab[32], end_lab[32];
9902 rtx xops[2];
9904 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9905 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9907 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9909 /* Jump to END_LAB if SP == LAST_ADDR. */
9910 xops[0] = stack_pointer_rtx;
9911 xops[1] = reg;
9912 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9913 fputs ("\tje\t", asm_out_file);
9914 assemble_name_raw (asm_out_file, end_lab);
9915 fputc ('\n', asm_out_file);
9917 /* SP = SP + PROBE_INTERVAL. */
9918 xops[1] = GEN_INT (PROBE_INTERVAL);
9919 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9921 /* Probe at SP. */
9922 xops[1] = const0_rtx;
9923 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9925 fprintf (asm_out_file, "\tjmp\t");
9926 assemble_name_raw (asm_out_file, loop_lab);
9927 fputc ('\n', asm_out_file);
9929 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9931 return "";
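
/* Schematically (AT&T syntax, assuming a 4096-byte PROBE_INTERVAL and
   the 64-bit scratch register %r11 holding LAST_ADDR), the loop printed
   above reads:

       .LPSRL0:
               cmpq    %r11, %rsp
               je      .LPSRE0
               subq    $4096, %rsp
               orq     $0, (%rsp)
               jmp     .LPSRL0
       .LPSRE0:
   */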
9934 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9935 inclusive. These are offsets from the current stack pointer. */
9937 static void
9938 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9940 /* See if we have a constant small number of probes to generate. If so,
9941 that's the easy case. The run-time loop is made up of 7 insns in the
9942 generic case while the compile-time loop is made up of n insns for n #
9943 of intervals. */
9944 if (size <= 7 * PROBE_INTERVAL)
9946 HOST_WIDE_INT i;
9948 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9949 it exceeds SIZE. If only one probe is needed, this will not
9950 generate any code. Then probe at FIRST + SIZE. */
9951 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9952 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9953 -(first + i)));
9955 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9956 -(first + size)));
9959 /* Otherwise, do the same as above, but in a loop. Note that we must be
9960 extra careful with variables wrapping around because we might be at
9961 the very top (or the very bottom) of the address space and we have
9962 to be able to handle this case properly; in particular, we use an
9963 equality test for the loop condition. */
9964 else
9966 HOST_WIDE_INT rounded_size, last;
9967 struct scratch_reg sr;
9969 get_scratch_register_on_entry (&sr);
9972 /* Step 1: round SIZE to the previous multiple of the interval. */
9974 rounded_size = size & -PROBE_INTERVAL;
9977 /* Step 2: compute initial and final value of the loop counter. */
9979 /* TEST_OFFSET = FIRST. */
9980 emit_move_insn (sr.reg, GEN_INT (-first));
9982 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9983 last = first + rounded_size;
9986 /* Step 3: the loop
9988 while (TEST_ADDR != LAST_ADDR)
9990 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9991 probe at TEST_ADDR
9994 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9995 until it is equal to ROUNDED_SIZE. */
9997 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10000 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10001 that SIZE is equal to ROUNDED_SIZE. */
10003 if (size != rounded_size)
10004 emit_stack_probe (plus_constant (Pmode,
10005 gen_rtx_PLUS (Pmode,
10006 stack_pointer_rtx,
10007 sr.reg),
10008 rounded_size - size));
10010 release_scratch_register_on_entry (&sr);
10013 /* Make sure nothing is scheduled before we are done. */
10014 emit_insn (gen_blockage ());
10017 /* Probe a range of stack addresses from REG to END, inclusive. These are
10018 offsets from the current stack pointer. */
10020 const char *
10021 output_probe_stack_range (rtx reg, rtx end)
10023 static int labelno = 0;
10024 char loop_lab[32], end_lab[32];
10025 rtx xops[3];
10027 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10028 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10030 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10032 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10033 xops[0] = reg;
10034 xops[1] = end;
10035 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10036 fputs ("\tje\t", asm_out_file);
10037 assemble_name_raw (asm_out_file, end_lab);
10038 fputc ('\n', asm_out_file);
10040 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10041 xops[1] = GEN_INT (PROBE_INTERVAL);
10042 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10044 /* Probe at TEST_ADDR. */
10045 xops[0] = stack_pointer_rtx;
10046 xops[1] = reg;
10047 xops[2] = const0_rtx;
10048 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10050 fprintf (asm_out_file, "\tjmp\t");
10051 assemble_name_raw (asm_out_file, loop_lab);
10052 fputc ('\n', asm_out_file);
10054 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10056 return "";
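
/* Schematically (same assumptions as above, with %r11 holding the
   negative TEST_OFFSET and LAST = FIRST + ROUNDED_SIZE), the loop
   printed above reads:

       .LPSRL1:
               cmpq    $-LAST, %r11
               je      .LPSRE1
               subq    $4096, %r11
               orq     $0, (%rsp,%r11)
               jmp     .LPSRL1
       .LPSRE1:
   */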
10059 /* Finalize the stack_realign_needed flag, which guides how the
10060 prologue/epilogue will be generated.  */
10061 static void
10062 ix86_finalize_stack_realign_flags (void)
10064 /* Check whether stack realignment is really needed after reload, and
10065 store the result in cfun.  */
10066 unsigned int incoming_stack_boundary
10067 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10068 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10069 unsigned int stack_realign = (incoming_stack_boundary
10070 < (crtl->is_leaf
10071 ? crtl->max_used_stack_slot_alignment
10072 : crtl->stack_alignment_needed));
10074 if (crtl->stack_realign_finalized)
10076 /* After stack_realign_needed is finalized, we can no longer
10077 change it.  */
10078 gcc_assert (crtl->stack_realign_needed == stack_realign);
10079 return;
10082 /* If the only reason for frame_pointer_needed is that we conservatively
10083 assumed stack realignment might be needed, but in the end nothing that
10084 needed the stack alignment had been spilled, clear frame_pointer_needed
10085 and say we don't need stack realignment. */
10086 if (stack_realign
10087 && !crtl->need_drap
10088 && frame_pointer_needed
10089 && crtl->is_leaf
10090 && flag_omit_frame_pointer
10091 && crtl->sp_is_unchanging
10092 && !ix86_current_function_calls_tls_descriptor
10093 && !crtl->accesses_prior_frames
10094 && !cfun->calls_alloca
10095 && !crtl->calls_eh_return
10096 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10097 && !ix86_frame_pointer_required ()
10098 && get_frame_size () == 0
10099 && ix86_nsaved_sseregs () == 0
10100 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10102 HARD_REG_SET set_up_by_prologue, prologue_used;
10103 basic_block bb;
10105 CLEAR_HARD_REG_SET (prologue_used);
10106 CLEAR_HARD_REG_SET (set_up_by_prologue);
10107 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10108 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10109 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10110 HARD_FRAME_POINTER_REGNUM);
10111 FOR_EACH_BB (bb)
10113 rtx insn;
10114 FOR_BB_INSNS (bb, insn)
10115 if (NONDEBUG_INSN_P (insn)
10116 && requires_stack_frame_p (insn, prologue_used,
10117 set_up_by_prologue))
10119 crtl->stack_realign_needed = stack_realign;
10120 crtl->stack_realign_finalized = true;
10121 return;
10125 frame_pointer_needed = false;
10126 stack_realign = false;
10127 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10128 crtl->stack_alignment_needed = incoming_stack_boundary;
10129 crtl->stack_alignment_estimated = incoming_stack_boundary;
10130 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10131 crtl->preferred_stack_boundary = incoming_stack_boundary;
10132 df_finish_pass (true);
10133 df_scan_alloc (NULL);
10134 df_scan_blocks ();
10135 df_compute_regs_ever_live (true);
10136 df_analyze ();
10139 crtl->stack_realign_needed = stack_realign;
10140 crtl->stack_realign_finalized = true;
10143 /* Expand the prologue into a bunch of separate insns. */
10145 void
10146 ix86_expand_prologue (void)
10148 struct machine_function *m = cfun->machine;
10149 rtx insn, t;
10150 bool pic_reg_used;
10151 struct ix86_frame frame;
10152 HOST_WIDE_INT allocate;
10153 bool int_registers_saved;
10154 bool sse_registers_saved;
10156 ix86_finalize_stack_realign_flags ();
10158 /* DRAP should not coexist with stack_realign_fp */
10159 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10161 memset (&m->fs, 0, sizeof (m->fs));
10163 /* Initialize CFA state for before the prologue. */
10164 m->fs.cfa_reg = stack_pointer_rtx;
10165 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10167 /* Track SP offset to the CFA. We continue tracking this after we've
10168 swapped the CFA register away from SP. In the case of re-alignment
10169 this is fudged; we're interested in offsets within the local frame.  */
10170 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10171 m->fs.sp_valid = true;
10173 ix86_compute_frame_layout (&frame);
10175 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10177 /* We should have already generated an error for any use of
10178 ms_hook on a nested function. */
10179 gcc_checking_assert (!ix86_static_chain_on_stack);
10181 /* Check if profiling is active and we shall use the profiling-before-
10182 prologue variant.  If so, issue a sorry.  */
10183 if (crtl->profile && flag_fentry != 0)
10184 sorry ("ms_hook_prologue attribute isn%'t compatible "
10185 "with -mfentry for 32-bit");
10187 /* In ix86_asm_output_function_label we emitted:
10188 8b ff movl.s %edi,%edi
10189 55 push %ebp
10190 8b ec movl.s %esp,%ebp
10192 This matches the hookable function prologue in Win32 API
10193 functions in Microsoft Windows XP Service Pack 2 and newer.
10194 Wine uses this to enable Windows apps to hook the Win32 API
10195 functions provided by Wine.
10197 What that means is that we've already set up the frame pointer. */
10199 if (frame_pointer_needed
10200 && !(crtl->drap_reg && crtl->stack_realign_needed))
10202 rtx push, mov;
10204 /* We've decided to use the frame pointer already set up.
10205 Describe this to the unwinder by pretending that both
10206 push and mov insns happen right here.
10208 Putting the unwind info here at the end of the ms_hook
10209 is done so that we can make absolutely certain we get
10210 the required byte sequence at the start of the function,
10211 rather than relying on an assembler that can produce
10212 the exact encoding required.
10214 However it does mean (in the unpatched case) that we have
10215 a 1 insn window where the asynchronous unwind info is
10216 incorrect. However, if we placed the unwind info at
10217 its correct location we would have incorrect unwind info
10218 in the patched case. Which is probably all moot since
10219 I don't expect Wine generates dwarf2 unwind info for the
10220 system libraries that use this feature. */
10222 insn = emit_insn (gen_blockage ());
10224 push = gen_push (hard_frame_pointer_rtx);
10225 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10226 stack_pointer_rtx);
10227 RTX_FRAME_RELATED_P (push) = 1;
10228 RTX_FRAME_RELATED_P (mov) = 1;
10230 RTX_FRAME_RELATED_P (insn) = 1;
10231 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10232 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10234 /* Note that gen_push incremented m->fs.cfa_offset, even
10235 though we didn't emit the push insn here. */
10236 m->fs.cfa_reg = hard_frame_pointer_rtx;
10237 m->fs.fp_offset = m->fs.cfa_offset;
10238 m->fs.fp_valid = true;
10240 else
10242 /* The frame pointer is not needed so pop %ebp again.
10243 This leaves us with a pristine state. */
10244 emit_insn (gen_pop (hard_frame_pointer_rtx));
10248 /* The first insn of a function that accepts its static chain on the
10249 stack is to push the register that would be filled in by a direct
10250 call. This insn will be skipped by the trampoline. */
10251 else if (ix86_static_chain_on_stack)
10253 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10254 emit_insn (gen_blockage ());
10256 /* We don't want to interpret this push insn as a register save,
10257 only as a stack adjustment. The real copy of the register as
10258 a save will be done later, if needed. */
10259 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10260 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10261 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10262 RTX_FRAME_RELATED_P (insn) = 1;
10265 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10266 DRAP is needed and stack realignment is really needed after reload.  */
10267 if (stack_realign_drap)
10269 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10271 /* Only need to push parameter pointer reg if it is caller saved. */
10272 if (!call_used_regs[REGNO (crtl->drap_reg)])
10274 /* Push arg pointer reg */
10275 insn = emit_insn (gen_push (crtl->drap_reg));
10276 RTX_FRAME_RELATED_P (insn) = 1;
10279 /* Grab the argument pointer. */
10280 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10281 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10282 RTX_FRAME_RELATED_P (insn) = 1;
10283 m->fs.cfa_reg = crtl->drap_reg;
10284 m->fs.cfa_offset = 0;
10286 /* Align the stack. */
10287 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10288 stack_pointer_rtx,
10289 GEN_INT (-align_bytes)));
10290 RTX_FRAME_RELATED_P (insn) = 1;
10292 /* Replicate the return address on the stack so that return
10293 address can be reached via (argp - 1) slot. This is needed
10294 to implement macro RETURN_ADDR_RTX and intrinsic function
10295 expand_builtin_return_addr etc. */
10296 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10297 t = gen_frame_mem (word_mode, t);
10298 insn = emit_insn (gen_push (t));
10299 RTX_FRAME_RELATED_P (insn) = 1;
10301 /* For the purposes of frame and register save area addressing,
10302 we've started over with a new frame. */
10303 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10304 m->fs.realigned = true;
10307 int_registers_saved = (frame.nregs == 0);
10308 sse_registers_saved = (frame.nsseregs == 0);
10310 if (frame_pointer_needed && !m->fs.fp_valid)
10312 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10313 slower on all targets. Also sdb doesn't like it. */
10314 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10315 RTX_FRAME_RELATED_P (insn) = 1;
10317 /* Push registers now, before setting the frame pointer
10318 on SEH target. */
10319 if (!int_registers_saved
10320 && TARGET_SEH
10321 && !frame.save_regs_using_mov)
10323 ix86_emit_save_regs ();
10324 int_registers_saved = true;
10325 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10328 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10330 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10331 RTX_FRAME_RELATED_P (insn) = 1;
10333 if (m->fs.cfa_reg == stack_pointer_rtx)
10334 m->fs.cfa_reg = hard_frame_pointer_rtx;
10335 m->fs.fp_offset = m->fs.sp_offset;
10336 m->fs.fp_valid = true;
10340 if (!int_registers_saved)
10342 /* If saving registers via PUSH, do so now. */
10343 if (!frame.save_regs_using_mov)
10345 ix86_emit_save_regs ();
10346 int_registers_saved = true;
10347 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10350 /* When using red zone we may start register saving before allocating
10351 the stack frame saving one cycle of the prologue. However, avoid
10352 doing this if we have to probe the stack; at least on x86_64 the
10353 stack probe can turn into a call that clobbers a red zone location. */
10354 else if (ix86_using_red_zone ()
10355 && (! TARGET_STACK_PROBE
10356 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10358 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10359 int_registers_saved = true;
10363 if (stack_realign_fp)
10365 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10366 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10368 /* The computation of the size of the re-aligned stack frame means
10369 that we must allocate the size of the register save area before
10370 performing the actual alignment. Otherwise we cannot guarantee
10371 that there's enough storage above the realignment point. */
10372 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10373 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10374 GEN_INT (m->fs.sp_offset
10375 - frame.sse_reg_save_offset),
10376 -1, false);
10378 /* Align the stack. */
10379 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10380 stack_pointer_rtx,
10381 GEN_INT (-align_bytes)));
10383 /* For the purposes of register save area addressing, the stack
10384 pointer is no longer valid. As for the value of sp_offset,
10385 see ix86_compute_frame_layout, which we need to match in order
10386 to pass verification of stack_pointer_offset at the end. */
10387 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10388 m->fs.sp_valid = false;
10391 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10393 if (flag_stack_usage_info)
10395 /* We start to count from ARG_POINTER. */
10396 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10398 /* If it was realigned, take into account the fake frame. */
10399 if (stack_realign_drap)
10401 if (ix86_static_chain_on_stack)
10402 stack_size += UNITS_PER_WORD;
10404 if (!call_used_regs[REGNO (crtl->drap_reg)])
10405 stack_size += UNITS_PER_WORD;
10407 /* This over-estimates by 1 minimal-stack-alignment-unit but
10408 mitigates that by counting in the new return address slot. */
10409 current_function_dynamic_stack_size
10410 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10413 current_function_static_stack_size = stack_size;
10416 /* On SEH target with very large frame size, allocate an area to save
10417 SSE registers (as the very large allocation won't be described). */
10418 if (TARGET_SEH
10419 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10420 && !sse_registers_saved)
10422 HOST_WIDE_INT sse_size =
10423 frame.sse_reg_save_offset - frame.reg_save_offset;
10425 gcc_assert (int_registers_saved);
10427 /* No need to do stack checking as the area will be immediately
10428 written. */
10429 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10430 GEN_INT (-sse_size), -1,
10431 m->fs.cfa_reg == stack_pointer_rtx);
10432 allocate -= sse_size;
10433 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10434 sse_registers_saved = true;
10437 /* The stack has already been decremented by the instruction calling us
10438 so probe if the size is non-negative to preserve the protection area. */
10439 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10441 /* We expect the registers to be saved when probes are used. */
10442 gcc_assert (int_registers_saved);
10444 if (STACK_CHECK_MOVING_SP)
10446 ix86_adjust_stack_and_probe (allocate);
10447 allocate = 0;
10449 else
10451 HOST_WIDE_INT size = allocate;
10453 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10454 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10456 if (TARGET_STACK_PROBE)
10457 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10458 else
10459 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10463 if (allocate == 0)
10465 else if (!ix86_target_stack_probe ()
10466 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10468 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10469 GEN_INT (-allocate), -1,
10470 m->fs.cfa_reg == stack_pointer_rtx);
10472 else
10474 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10475 rtx r10 = NULL;
10476 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10477 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10478 bool eax_live = false;
10479 bool r10_live = false;
10481 if (TARGET_64BIT)
10482 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10483 if (!TARGET_64BIT_MS_ABI)
10484 eax_live = ix86_eax_live_at_start_p ();
10486 /* Note that SEH directives need to continue tracking the stack
10487 pointer even after the frame pointer has been set up. */
10488 if (eax_live)
10490 insn = emit_insn (gen_push (eax));
10491 allocate -= UNITS_PER_WORD;
10492 if (sp_is_cfa_reg || TARGET_SEH)
10494 if (sp_is_cfa_reg)
10495 m->fs.cfa_offset += UNITS_PER_WORD;
10496 RTX_FRAME_RELATED_P (insn) = 1;
10500 if (r10_live)
10502 r10 = gen_rtx_REG (Pmode, R10_REG);
10503 insn = emit_insn (gen_push (r10));
10504 allocate -= UNITS_PER_WORD;
10505 if (sp_is_cfa_reg || TARGET_SEH)
10507 if (sp_is_cfa_reg)
10508 m->fs.cfa_offset += UNITS_PER_WORD;
10509 RTX_FRAME_RELATED_P (insn) = 1;
10513 emit_move_insn (eax, GEN_INT (allocate));
10514 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10516 /* Use the fact that AX still contains ALLOCATE. */
10517 adjust_stack_insn = (Pmode == DImode
10518 ? gen_pro_epilogue_adjust_stack_di_sub
10519 : gen_pro_epilogue_adjust_stack_si_sub);
10521 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10522 stack_pointer_rtx, eax));
10524 if (sp_is_cfa_reg || TARGET_SEH)
10526 if (sp_is_cfa_reg)
10527 m->fs.cfa_offset += allocate;
10528 RTX_FRAME_RELATED_P (insn) = 1;
10529 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10530 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10531 plus_constant (Pmode, stack_pointer_rtx,
10532 -allocate)));
10534 m->fs.sp_offset += allocate;
10536 if (r10_live && eax_live)
10538 t = choose_baseaddr (m->fs.sp_offset - allocate);
10539 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10540 gen_frame_mem (word_mode, t));
10541 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10542 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10543 gen_frame_mem (word_mode, t));
10545 else if (eax_live || r10_live)
10547 t = choose_baseaddr (m->fs.sp_offset - allocate);
10548 emit_move_insn (gen_rtx_REG (word_mode,
10549 (eax_live ? AX_REG : R10_REG)),
10550 gen_frame_mem (word_mode, t));
10553 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10555 /* If we haven't already set up the frame pointer, do so now.  */
10556 if (frame_pointer_needed && !m->fs.fp_valid)
10558 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10559 GEN_INT (frame.stack_pointer_offset
10560 - frame.hard_frame_pointer_offset));
10561 insn = emit_insn (insn);
10562 RTX_FRAME_RELATED_P (insn) = 1;
10563 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10565 if (m->fs.cfa_reg == stack_pointer_rtx)
10566 m->fs.cfa_reg = hard_frame_pointer_rtx;
10567 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10568 m->fs.fp_valid = true;
10571 if (!int_registers_saved)
10572 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10573 if (!sse_registers_saved)
10574 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10576 pic_reg_used = false;
10577 if (pic_offset_table_rtx
10578 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10579 || crtl->profile))
10581 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10583 if (alt_pic_reg_used != INVALID_REGNUM)
10584 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10586 pic_reg_used = true;
10589 if (pic_reg_used)
10591 if (TARGET_64BIT)
10593 if (ix86_cmodel == CM_LARGE_PIC)
10595 rtx label, tmp_reg;
10597 gcc_assert (Pmode == DImode);
10598 label = gen_label_rtx ();
10599 emit_label (label);
10600 LABEL_PRESERVE_P (label) = 1;
10601 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10602 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10603 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10604 label));
10605 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10606 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10607 pic_offset_table_rtx, tmp_reg));
10609 else
10610 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10612 else
10614 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10615 RTX_FRAME_RELATED_P (insn) = 1;
10616 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10620 /* In the pic_reg_used case, make sure that the got load isn't deleted
10621 when mcount needs it. Blockage to avoid call movement across mcount
10622 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10623 note. */
10624 if (crtl->profile && !flag_fentry && pic_reg_used)
10625 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10627 if (crtl->drap_reg && !crtl->stack_realign_needed)
10629 /* vDRAP is set up, but after reload it turns out stack realignment
10630 isn't necessary; here we emit prologue code to set up DRAP
10631 without the stack realignment adjustment.  */
10632 t = choose_baseaddr (0);
10633 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10636 /* Prevent instructions from being scheduled into register save push
10637 sequence when access to the redzone area is done through frame pointer.
10638 The offset between the frame pointer and the stack pointer is calculated
10639 relative to the value of the stack pointer at the end of the function
10640 prologue, and moving instructions that access redzone area via frame
10641 pointer inside push sequence violates this assumption. */
10642 if (frame_pointer_needed && frame.red_zone_size)
10643 emit_insn (gen_memory_blockage ());
10645 /* Emit cld instruction if stringops are used in the function. */
10646 if (TARGET_CLD && ix86_current_function_needs_cld)
10647 emit_insn (gen_cld ());
10649 /* SEH requires that the prologue end within 256 bytes of the start of
10650 the function. Prevent instruction schedules that would extend that.
10651 Further, prevent alloca modifications to the stack pointer from being
10652 combined with prologue modifications. */
10653 if (TARGET_SEH)
10654 emit_insn (gen_prologue_use (stack_pointer_rtx));
10657 /* Emit code to restore REG using a POP insn. */
10659 static void
10660 ix86_emit_restore_reg_using_pop (rtx reg)
10662 struct machine_function *m = cfun->machine;
10663 rtx insn = emit_insn (gen_pop (reg));
10665 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10666 m->fs.sp_offset -= UNITS_PER_WORD;
10668 if (m->fs.cfa_reg == crtl->drap_reg
10669 && REGNO (reg) == REGNO (crtl->drap_reg))
10671 /* Previously we'd represented the CFA as an expression
10672 like *(%ebp - 8). We've just popped that value from
10673 the stack, which means we need to reset the CFA to
10674 the drap register. This will remain until we restore
10675 the stack pointer. */
10676 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10677 RTX_FRAME_RELATED_P (insn) = 1;
10679 /* This means that the DRAP register is valid for addressing too. */
10680 m->fs.drap_valid = true;
10681 return;
10684 if (m->fs.cfa_reg == stack_pointer_rtx)
10686 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10687 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10688 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10689 RTX_FRAME_RELATED_P (insn) = 1;
10691 m->fs.cfa_offset -= UNITS_PER_WORD;
10694 /* When the frame pointer is the CFA, and we pop it, we are
10695 swapping back to the stack pointer as the CFA. This happens
10696 for stack frames that don't allocate other data, so we assume
10697 the stack pointer is now pointing at the return address, i.e.
10698 the function entry state, which makes the offset be 1 word. */
10699 if (reg == hard_frame_pointer_rtx)
10701 m->fs.fp_valid = false;
10702 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10704 m->fs.cfa_reg = stack_pointer_rtx;
10705 m->fs.cfa_offset -= UNITS_PER_WORD;
10707 add_reg_note (insn, REG_CFA_DEF_CFA,
10708 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10709 GEN_INT (m->fs.cfa_offset)));
10710 RTX_FRAME_RELATED_P (insn) = 1;
10715 /* Emit code to restore saved registers using POP insns. */
10717 static void
10718 ix86_emit_restore_regs_using_pop (void)
10720 unsigned int regno;
10722 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10723 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10724 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10727 /* Emit code and notes for the LEAVE instruction. */
10729 static void
10730 ix86_emit_leave (void)
10732 struct machine_function *m = cfun->machine;
10733 rtx insn = emit_insn (ix86_gen_leave ());
10735 ix86_add_queued_cfa_restore_notes (insn);
10737 gcc_assert (m->fs.fp_valid);
10738 m->fs.sp_valid = true;
10739 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10740 m->fs.fp_valid = false;
10742 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10744 m->fs.cfa_reg = stack_pointer_rtx;
10745 m->fs.cfa_offset = m->fs.sp_offset;
10747 add_reg_note (insn, REG_CFA_DEF_CFA,
10748 plus_constant (Pmode, stack_pointer_rtx,
10749 m->fs.sp_offset));
10750 RTX_FRAME_RELATED_P (insn) = 1;
10752 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10753 m->fs.fp_offset);
10756 /* Emit code to restore saved registers using MOV insns.
10757 First register is restored from CFA - CFA_OFFSET. */
10758 static void
10759 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10760 bool maybe_eh_return)
10762 struct machine_function *m = cfun->machine;
10763 unsigned int regno;
10765 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10766 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10768 rtx reg = gen_rtx_REG (word_mode, regno);
10769 rtx insn, mem;
10771 mem = choose_baseaddr (cfa_offset);
10772 mem = gen_frame_mem (word_mode, mem);
10773 insn = emit_move_insn (reg, mem);
10775 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10777 /* Previously we'd represented the CFA as an expression
10778 like *(%ebp - 8). We've just popped that value from
10779 the stack, which means we need to reset the CFA to
10780 the drap register. This will remain until we restore
10781 the stack pointer. */
10782 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10783 RTX_FRAME_RELATED_P (insn) = 1;
10785 /* This means that the DRAP register is valid for addressing. */
10786 m->fs.drap_valid = true;
10788 else
10789 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10791 cfa_offset -= UNITS_PER_WORD;
10795 /* Emit code to restore saved registers using MOV insns.
10796 First register is restored from CFA - CFA_OFFSET. */
10797 static void
10798 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10799 bool maybe_eh_return)
10801 unsigned int regno;
10803 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10804 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10806 rtx reg = gen_rtx_REG (V4SFmode, regno);
10807 rtx mem;
10809 mem = choose_baseaddr (cfa_offset);
10810 mem = gen_rtx_MEM (V4SFmode, mem);
10811 set_mem_align (mem, 128);
10812 emit_move_insn (reg, mem);
10814 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10816 cfa_offset -= 16;
10820 /* Restore function stack, frame, and registers. */
10822 void
10823 ix86_expand_epilogue (int style)
10825 struct machine_function *m = cfun->machine;
10826 struct machine_frame_state frame_state_save = m->fs;
10827 struct ix86_frame frame;
10828 bool restore_regs_via_mov;
10829 bool using_drap;
10831 ix86_finalize_stack_realign_flags ();
10832 ix86_compute_frame_layout (&frame);
10834 m->fs.sp_valid = (!frame_pointer_needed
10835 || (crtl->sp_is_unchanging
10836 && !stack_realign_fp));
10837 gcc_assert (!m->fs.sp_valid
10838 || m->fs.sp_offset == frame.stack_pointer_offset);
10840 /* The FP must be valid if the frame pointer is present. */
10841 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10842 gcc_assert (!m->fs.fp_valid
10843 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10845 /* We must have *some* valid pointer to the stack frame. */
10846 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10848 /* The DRAP is never valid at this point. */
10849 gcc_assert (!m->fs.drap_valid);
10851 /* See the comment about red zone and frame
10852 pointer usage in ix86_expand_prologue. */
10853 if (frame_pointer_needed && frame.red_zone_size)
10854 emit_insn (gen_memory_blockage ());
10856 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10857 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10859 /* Determine the CFA offset of the end of the red-zone. */
10860 m->fs.red_zone_offset = 0;
10861 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10863 /* The red-zone begins below the return address. */
10864 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10866 /* When the register save area is in the aligned portion of
10867 the stack, determine the maximum runtime displacement that
10868 matches up with the aligned frame. */
10869 if (stack_realign_drap)
10870 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10871 + UNITS_PER_WORD);
10874 /* Special care must be taken for the normal return case of a function
10875 using eh_return: the eax and edx registers are marked as saved, but
10876 not restored along this path. Adjust the save location to match. */
10877 if (crtl->calls_eh_return && style != 2)
10878 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10880 /* EH_RETURN requires the use of moves to function properly. */
10881 if (crtl->calls_eh_return)
10882 restore_regs_via_mov = true;
10883 /* SEH requires the use of pops to identify the epilogue. */
10884 else if (TARGET_SEH)
10885 restore_regs_via_mov = false;
10886 /* If we're only restoring one register and sp is not valid, then
10887 use a move instruction to restore the register, since it's
10888 less work than reloading sp and popping the register. */
10889 else if (!m->fs.sp_valid && frame.nregs <= 1)
10890 restore_regs_via_mov = true;
10891 else if (TARGET_EPILOGUE_USING_MOVE
10892 && cfun->machine->use_fast_prologue_epilogue
10893 && (frame.nregs > 1
10894 || m->fs.sp_offset != frame.reg_save_offset))
10895 restore_regs_via_mov = true;
10896 else if (frame_pointer_needed
10897 && !frame.nregs
10898 && m->fs.sp_offset != frame.reg_save_offset)
10899 restore_regs_via_mov = true;
10900 else if (frame_pointer_needed
10901 && TARGET_USE_LEAVE
10902 && cfun->machine->use_fast_prologue_epilogue
10903 && frame.nregs == 1)
10904 restore_regs_via_mov = true;
10905 else
10906 restore_regs_via_mov = false;
10908 if (restore_regs_via_mov || frame.nsseregs)
10910 /* Ensure that the entire register save area is addressable via
10911 the stack pointer, if we will restore via sp. */
10912 if (TARGET_64BIT
10913 && m->fs.sp_offset > 0x7fffffff
10914 && !(m->fs.fp_valid || m->fs.drap_valid)
10915 && (frame.nsseregs + frame.nregs) != 0)
10917 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10918 GEN_INT (m->fs.sp_offset
10919 - frame.sse_reg_save_offset),
10920 style,
10921 m->fs.cfa_reg == stack_pointer_rtx);
10925 /* If there are any SSE registers to restore, then we have to do it
10926 via moves, since there's obviously no pop for SSE regs. */
10927 if (frame.nsseregs)
10928 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10929 style == 2);
10931 if (restore_regs_via_mov)
10933 rtx t;
10935 if (frame.nregs)
10936 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10938 /* eh_return epilogues need %ecx added to the stack pointer. */
10939 if (style == 2)
10941 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10943 /* Stack align doesn't work with eh_return. */
10944 gcc_assert (!stack_realign_drap);
10945 /* Neither do nested regparm functions. */
10946 gcc_assert (!ix86_static_chain_on_stack);
10948 if (frame_pointer_needed)
10950 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10951 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10952 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10954 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10955 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10957 /* Note that we use SA as a temporary CFA, as the return
10958 address is at the proper place relative to it. We
10959 pretend this happens at the FP restore insn because
10960 prior to this insn the FP would be stored at the wrong
10961 offset relative to SA, and after this insn we have no
10962 other reasonable register to use for the CFA. We don't
10963 bother resetting the CFA to the SP for the duration of
10964 the return insn. */
10965 add_reg_note (insn, REG_CFA_DEF_CFA,
10966 plus_constant (Pmode, sa, UNITS_PER_WORD));
10967 ix86_add_queued_cfa_restore_notes (insn);
10968 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10969 RTX_FRAME_RELATED_P (insn) = 1;
10971 m->fs.cfa_reg = sa;
10972 m->fs.cfa_offset = UNITS_PER_WORD;
10973 m->fs.fp_valid = false;
10975 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10976 const0_rtx, style, false);
10978 else
10980 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10981 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10982 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10983 ix86_add_queued_cfa_restore_notes (insn);
10985 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10986 if (m->fs.cfa_offset != UNITS_PER_WORD)
10988 m->fs.cfa_offset = UNITS_PER_WORD;
10989 add_reg_note (insn, REG_CFA_DEF_CFA,
10990 plus_constant (Pmode, stack_pointer_rtx,
10991 UNITS_PER_WORD));
10992 RTX_FRAME_RELATED_P (insn) = 1;
10995 m->fs.sp_offset = UNITS_PER_WORD;
10996 m->fs.sp_valid = true;
10999 else
11001 /* SEH requires that the function end with (1) a stack adjustment
11002 if necessary, (2) a sequence of pops, and (3) a return or
11003 jump instruction. Prevent insns from the function body from
11004 being scheduled into this sequence. */
11005 if (TARGET_SEH)
11007 /* Prevent a catch region from being adjacent to the standard
11008 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11009 several other flags that would be interesting to test are
11010 set up yet. */
11011 if (flag_non_call_exceptions)
11012 emit_insn (gen_nops (const1_rtx));
11013 else
11014 emit_insn (gen_blockage ());
11017 /* The first step is to deallocate the stack frame so that we can
11018 pop the registers. Also do it on SEH targets for very large
11019 frames, as the emitted instructions aren't allowed by the ABI in
11020 epilogues. */
11021 if (!m->fs.sp_valid
11022 || (TARGET_SEH
11023 && (m->fs.sp_offset - frame.reg_save_offset
11024 >= SEH_MAX_FRAME_SIZE)))
11026 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11027 GEN_INT (m->fs.fp_offset
11028 - frame.reg_save_offset),
11029 style, false);
11031 else if (m->fs.sp_offset != frame.reg_save_offset)
11033 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11034 GEN_INT (m->fs.sp_offset
11035 - frame.reg_save_offset),
11036 style,
11037 m->fs.cfa_reg == stack_pointer_rtx);
11040 ix86_emit_restore_regs_using_pop ();
11043 /* If we used a frame pointer and haven't already got rid of it,
11044 then do so now. */
11045 if (m->fs.fp_valid)
11047 /* If the stack pointer is valid and pointing at the frame
11048 pointer store address, then we only need a pop. */
11049 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11050 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11051 /* Leave results in shorter dependency chains on CPUs that are
11052 able to grok it fast. */
11053 else if (TARGET_USE_LEAVE
11054 || optimize_function_for_size_p (cfun)
11055 || !cfun->machine->use_fast_prologue_epilogue)
11056 ix86_emit_leave ();
11057 else
11059 pro_epilogue_adjust_stack (stack_pointer_rtx,
11060 hard_frame_pointer_rtx,
11061 const0_rtx, style, !using_drap);
11062 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11066 if (using_drap)
11068 int param_ptr_offset = UNITS_PER_WORD;
11069 rtx insn;
11071 gcc_assert (stack_realign_drap);
11073 if (ix86_static_chain_on_stack)
11074 param_ptr_offset += UNITS_PER_WORD;
11075 if (!call_used_regs[REGNO (crtl->drap_reg)])
11076 param_ptr_offset += UNITS_PER_WORD;
11078 insn = emit_insn (gen_rtx_SET
11079 (VOIDmode, stack_pointer_rtx,
11080 gen_rtx_PLUS (Pmode,
11081 crtl->drap_reg,
11082 GEN_INT (-param_ptr_offset))));
11083 m->fs.cfa_reg = stack_pointer_rtx;
11084 m->fs.cfa_offset = param_ptr_offset;
11085 m->fs.sp_offset = param_ptr_offset;
11086 m->fs.realigned = false;
11088 add_reg_note (insn, REG_CFA_DEF_CFA,
11089 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11090 GEN_INT (param_ptr_offset)));
11091 RTX_FRAME_RELATED_P (insn) = 1;
11093 if (!call_used_regs[REGNO (crtl->drap_reg)])
11094 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11097 /* At this point the stack pointer must be valid, and we must have
11098 restored all of the registers. We may not have deallocated the
11099 entire stack frame. We've delayed this until now because it may
11100 be possible to merge the local stack deallocation with the
11101 deallocation forced by ix86_static_chain_on_stack. */
11102 gcc_assert (m->fs.sp_valid);
11103 gcc_assert (!m->fs.fp_valid);
11104 gcc_assert (!m->fs.realigned);
11105 if (m->fs.sp_offset != UNITS_PER_WORD)
11107 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11108 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11109 style, true);
11111 else
11112 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11114 /* Sibcall epilogues don't want a return instruction. */
11115 if (style == 0)
11117 m->fs = frame_state_save;
11118 return;
11121 if (crtl->args.pops_args && crtl->args.size)
11123 rtx popc = GEN_INT (crtl->args.pops_args);
11125 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11126 address, do an explicit add, and jump indirectly to the caller. */
11128 if (crtl->args.pops_args >= 65536)
11130 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11131 rtx insn;
11133 /* There is no "pascal" calling convention in any 64bit ABI. */
11134 gcc_assert (!TARGET_64BIT);
11136 insn = emit_insn (gen_pop (ecx));
11137 m->fs.cfa_offset -= UNITS_PER_WORD;
11138 m->fs.sp_offset -= UNITS_PER_WORD;
11140 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11141 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11142 add_reg_note (insn, REG_CFA_REGISTER,
11143 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11144 RTX_FRAME_RELATED_P (insn) = 1;
11146 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11147 popc, -1, true);
11148 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11150 else
11151 emit_jump_insn (gen_simple_return_pop_internal (popc));
11153 else
11154 emit_jump_insn (gen_simple_return_internal ());
11156 /* Restore the state back to the state from the prologue,
11157 so that it's correct for the next epilogue. */
11158 m->fs = frame_state_save;
11161 /* Reset from the function's potential modifications. */
11163 static void
11164 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11165 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11167 if (pic_offset_table_rtx)
11168 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11169 #if TARGET_MACHO
11170 /* Mach-O doesn't support labels at the end of objects, so if
11171 it looks like we might want one, insert a NOP. */
11173 rtx insn = get_last_insn ();
11174 rtx deleted_debug_label = NULL_RTX;
11175 while (insn
11176 && NOTE_P (insn)
11177 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11179 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11180 notes only, instead set their CODE_LABEL_NUMBER to -1,
11181 otherwise there would be code generation differences
11182 in between -g and -g0. */
11183 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11184 deleted_debug_label = insn;
11185 insn = PREV_INSN (insn);
11187 if (insn
11188 && (LABEL_P (insn)
11189 || (NOTE_P (insn)
11190 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11191 fputs ("\tnop\n", file);
11192 else if (deleted_debug_label)
11193 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11194 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11195 CODE_LABEL_NUMBER (insn) = -1;
11197 #endif
11201 /* Return a scratch register to use in the split stack prologue. The
11202 split stack prologue is used for -fsplit-stack. It is the first
11203 instructions in the function, even before the regular prologue.
11204 The scratch register can be any caller-saved register which is not
11205 used for parameters or for the static chain. */
11207 static unsigned int
11208 split_stack_prologue_scratch_regno (void)
11210 if (TARGET_64BIT)
11211 return R11_REG;
11212 else
11214 bool is_fastcall, is_thiscall;
11215 int regparm;
11217 is_fastcall = (lookup_attribute ("fastcall",
11218 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11219 != NULL);
11220 is_thiscall = (lookup_attribute ("thiscall",
11221 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11222 != NULL);
11223 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11225 if (is_fastcall)
11227 if (DECL_STATIC_CHAIN (cfun->decl))
11229 sorry ("-fsplit-stack does not support fastcall with "
11230 "nested function");
11231 return INVALID_REGNUM;
11233 return AX_REG;
11235 else if (is_thiscall)
11237 if (!DECL_STATIC_CHAIN (cfun->decl))
11238 return DX_REG;
11239 return AX_REG;
11241 else if (regparm < 3)
11243 if (!DECL_STATIC_CHAIN (cfun->decl))
11244 return CX_REG;
11245 else
11247 if (regparm >= 2)
11249 sorry ("-fsplit-stack does not support 2 register "
11250 " parameters for a nested function");
11251 return INVALID_REGNUM;
11253 return DX_REG;
11256 else
11258 /* FIXME: We could make this work by pushing a register
11259 around the addition and comparison. */
11260 sorry ("-fsplit-stack does not support 3 register parameters");
11261 return INVALID_REGNUM;
11266 /* A SYMBOL_REF for the function which allocates new stack space for
11267 -fsplit-stack. */
11269 static GTY(()) rtx split_stack_fn;
11271 /* A SYMBOL_REF for the more stack function when using the large
11272 model. */
11274 static GTY(()) rtx split_stack_fn_large;
11276 /* Handle -fsplit-stack. These are the first instructions in the
11277 function, even before the regular prologue. */
11279 void
11280 ix86_expand_split_stack_prologue (void)
11282 struct ix86_frame frame;
11283 HOST_WIDE_INT allocate;
11284 unsigned HOST_WIDE_INT args_size;
11285 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11286 rtx scratch_reg = NULL_RTX;
11287 rtx varargs_label = NULL_RTX;
11288 rtx fn;
11290 gcc_assert (flag_split_stack && reload_completed);
11292 ix86_finalize_stack_realign_flags ();
11293 ix86_compute_frame_layout (&frame);
11294 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11296 /* This is the label we will branch to if we have enough stack
11297 space. We expect the basic block reordering pass to reverse this
11298 branch if optimizing, so that we branch in the unlikely case. */
11299 label = gen_label_rtx ();
11301 /* We need to compare the stack pointer minus the frame size with
11302 the stack boundary in the TCB. The stack boundary always gives
11303 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11304 can compare directly. Otherwise we need to do an addition. */
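/* Illustratively, for a small frame this boils down to comparing the
   stack pointer directly against the TCB field referenced through the
   UNSPEC_STACK_CHECK address built below, roughly
   "cmp %fs/%gs:<guard>, %rsp; jae .Lenough"; the guard offset is
   libc/target specific and is not spelled out here.  Larger frames
   first compute sp - frame size into a scratch register and compare
   that instead.  */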
11306 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11307 UNSPEC_STACK_CHECK);
11308 limit = gen_rtx_CONST (Pmode, limit);
11309 limit = gen_rtx_MEM (Pmode, limit);
11310 if (allocate < SPLIT_STACK_AVAILABLE)
11311 current = stack_pointer_rtx;
11312 else
11314 unsigned int scratch_regno;
11315 rtx offset;
11317 /* We need a scratch register to hold the stack pointer minus
11318 the required frame size. Since this is the very start of the
11319 function, the scratch register can be any caller-saved
11320 register which is not used for parameters. */
11321 offset = GEN_INT (- allocate);
11322 scratch_regno = split_stack_prologue_scratch_regno ();
11323 if (scratch_regno == INVALID_REGNUM)
11324 return;
11325 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11326 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11328 /* We don't use ix86_gen_add3 in this case because it will
11329 want to split to lea, but when not optimizing the insn
11330 will not be split after this point. */
11331 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11332 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11333 offset)));
11335 else
11337 emit_move_insn (scratch_reg, offset);
11338 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11339 stack_pointer_rtx));
11341 current = scratch_reg;
11344 ix86_expand_branch (GEU, current, limit, label);
11345 jump_insn = get_last_insn ();
11346 JUMP_LABEL (jump_insn) = label;
11348 /* Mark the jump as very likely to be taken. */
11349 add_reg_note (jump_insn, REG_BR_PROB,
11350 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11352 if (split_stack_fn == NULL_RTX)
11353 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11354 fn = split_stack_fn;
11356 /* Get more stack space. We pass in the desired stack space and the
11357 size of the arguments to copy to the new stack. In 32-bit mode
11358 we push the parameters; __morestack will return on a new stack
11359 anyhow. In 64-bit mode we pass the parameters in r10 and
11360 r11. */
11361 allocate_rtx = GEN_INT (allocate);
11362 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11363 call_fusage = NULL_RTX;
11364 if (TARGET_64BIT)
11366 rtx reg10, reg11;
11368 reg10 = gen_rtx_REG (Pmode, R10_REG);
11369 reg11 = gen_rtx_REG (Pmode, R11_REG);
11371 /* If this function uses a static chain, it will be in %r10.
11372 Preserve it across the call to __morestack. */
11373 if (DECL_STATIC_CHAIN (cfun->decl))
11375 rtx rax;
11377 rax = gen_rtx_REG (word_mode, AX_REG);
11378 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11379 use_reg (&call_fusage, rax);
11382 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11384 HOST_WIDE_INT argval;
11386 gcc_assert (Pmode == DImode);
11387 /* When using the large model we need to load the address
11388 into a register, and we've run out of registers. So we
11389 switch to a different calling convention, and we call a
11390 different function: __morestack_large_model. We pass the
11391 argument size in the upper 32 bits of r10 and pass the
11392 frame size in the lower 32 bits. */
11393 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11394 gcc_assert ((args_size & 0xffffffff) == args_size);
11396 if (split_stack_fn_large == NULL_RTX)
11397 split_stack_fn_large =
11398 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11400 if (ix86_cmodel == CM_LARGE_PIC)
11402 rtx label, x;
11404 label = gen_label_rtx ();
11405 emit_label (label);
11406 LABEL_PRESERVE_P (label) = 1;
11407 emit_insn (gen_set_rip_rex64 (reg10, label));
11408 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11409 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11410 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11411 UNSPEC_GOT);
11412 x = gen_rtx_CONST (Pmode, x);
11413 emit_move_insn (reg11, x);
11414 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11415 x = gen_const_mem (Pmode, x);
11416 emit_move_insn (reg11, x);
11418 else
11419 emit_move_insn (reg11, split_stack_fn_large);
11421 fn = reg11;
11423 argval = ((args_size << 16) << 16) + allocate;
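/* For instance (hypothetical values), args_size == 0x20 and
   allocate == 0x1000 give argval == 0x0000002000001000: the argument
   size lands in the upper 32 bits and the frame size in the lower
   32 bits.  The shift is written (<< 16) << 16 rather than << 32,
   presumably so the expression stays well-defined even if
   HOST_WIDE_INT is only 32 bits wide.  */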
11424 emit_move_insn (reg10, GEN_INT (argval));
11426 else
11428 emit_move_insn (reg10, allocate_rtx);
11429 emit_move_insn (reg11, GEN_INT (args_size));
11430 use_reg (&call_fusage, reg11);
11433 use_reg (&call_fusage, reg10);
11435 else
11437 emit_insn (gen_push (GEN_INT (args_size)));
11438 emit_insn (gen_push (allocate_rtx));
11440 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11441 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11442 NULL_RTX, false);
11443 add_function_usage_to (call_insn, call_fusage);
11445 /* In order to make call/return prediction work right, we now need
11446 to execute a return instruction. See
11447 libgcc/config/i386/morestack.S for the details on how this works.
11449 For flow purposes gcc must not see this as a return
11450 instruction--we need control flow to continue at the subsequent
11451 label. Therefore, we use an unspec. */
11452 gcc_assert (crtl->args.pops_args < 65536);
11453 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11455 /* If we are in 64-bit mode and this function uses a static chain,
11456 we saved %r10 in %rax before calling __morestack. */
11457 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11458 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11459 gen_rtx_REG (word_mode, AX_REG));
11461 /* If this function calls va_start, we need to store a pointer to
11462 the arguments on the old stack, because they may not have been
11463 all copied to the new stack. At this point the old stack can be
11464 found at the frame pointer value used by __morestack, because
11465 __morestack has set that up before calling back to us. Here we
11466 store that pointer in a scratch register, and in
11467 ix86_expand_prologue we store the scratch register in a stack
11468 slot. */
11469 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11471 unsigned int scratch_regno;
11472 rtx frame_reg;
11473 int words;
11475 scratch_regno = split_stack_prologue_scratch_regno ();
11476 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11477 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11479 /* 64-bit:
11480 fp -> old fp value
11481 return address within this function
11482 return address of caller of this function
11483 stack arguments
11484 So we add three words to get to the stack arguments.
11486 32-bit:
11487 fp -> old fp value
11488 return address within this function
11489 first argument to __morestack
11490 second argument to __morestack
11491 return address of caller of this function
11492 stack arguments
11493 So we add five words to get to the stack arguments.
11495 words = TARGET_64BIT ? 3 : 5;
11496 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11497 gen_rtx_PLUS (Pmode, frame_reg,
11498 GEN_INT (words * UNITS_PER_WORD))));
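/* For example, in 64-bit mode this computes scratch = %rbp + 24
   (3 words of 8 bytes each), and in 32-bit mode scratch = %ebp + 20
   (5 words of 4 bytes each), matching the layouts sketched in the
   comment above.  */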
11500 varargs_label = gen_label_rtx ();
11501 emit_jump_insn (gen_jump (varargs_label));
11502 JUMP_LABEL (get_last_insn ()) = varargs_label;
11504 emit_barrier ();
11507 emit_label (label);
11508 LABEL_NUSES (label) = 1;
11510 /* If this function calls va_start, we now have to set the scratch
11511 register for the case where we do not call __morestack. In this
11512 case we need to set it based on the stack pointer. */
11513 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11515 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11516 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11517 GEN_INT (UNITS_PER_WORD))));
11519 emit_label (varargs_label);
11520 LABEL_NUSES (varargs_label) = 1;
11524 /* We may have to tell the dataflow pass that the split stack prologue
11525 is initializing a scratch register. */
11527 static void
11528 ix86_live_on_entry (bitmap regs)
11530 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11532 gcc_assert (flag_split_stack);
11533 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11537 /* Determine if op is suitable SUBREG RTX for address. */
11539 static bool
11540 ix86_address_subreg_operand (rtx op)
11542 enum machine_mode mode;
11544 if (!REG_P (op))
11545 return false;
11547 mode = GET_MODE (op);
11549 if (GET_MODE_CLASS (mode) != MODE_INT)
11550 return false;
11552 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11553 failures when the register is one word out of a two word structure. */
11554 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11555 return false;
11557 /* Allow only SUBREGs of non-eliminable hard registers. */
11558 return register_no_elim_operand (op, mode);
11561 /* Extract the parts of an RTL expression that is a valid memory address
11562 for an instruction. Return 0 if the structure of the address is
11563 grossly off. Return -1 if the address contains ASHIFT, so it is not
11564 strictly valid, but still used for computing length of lea instruction. */
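/* As an illustrative example (register names used only for readability),
   the address (plus (plus (reg %ebx) (mult (reg %ecx) (const_int 4)))
                     (const_int 12)), i.e. 12(%ebx,%ecx,4), decomposes
   into base = %ebx, index = %ecx, scale = 4, disp = 12, seg = SEG_DEFAULT.  */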
11567 ix86_decompose_address (rtx addr, struct ix86_address *out)
11569 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11570 rtx base_reg, index_reg;
11571 HOST_WIDE_INT scale = 1;
11572 rtx scale_rtx = NULL_RTX;
11573 rtx tmp;
11574 int retval = 1;
11575 enum ix86_address_seg seg = SEG_DEFAULT;
11577 /* Allow zero-extended SImode addresses,
11578 they will be emitted with addr32 prefix. */
11579 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11581 if (GET_CODE (addr) == ZERO_EXTEND
11582 && GET_MODE (XEXP (addr, 0)) == SImode)
11584 addr = XEXP (addr, 0);
11585 if (CONST_INT_P (addr))
11586 return 0;
11588 else if (GET_CODE (addr) == AND
11589 && const_32bit_mask (XEXP (addr, 1), DImode))
11591 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11592 if (addr == NULL_RTX)
11593 return 0;
11595 if (CONST_INT_P (addr))
11596 return 0;
11600 /* Allow SImode subregs of DImode addresses,
11601 they will be emitted with addr32 prefix. */
11602 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11604 if (GET_CODE (addr) == SUBREG
11605 && GET_MODE (SUBREG_REG (addr)) == DImode)
11607 addr = SUBREG_REG (addr);
11608 if (CONST_INT_P (addr))
11609 return 0;
11613 if (REG_P (addr))
11614 base = addr;
11615 else if (GET_CODE (addr) == SUBREG)
11617 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11618 base = addr;
11619 else
11620 return 0;
11622 else if (GET_CODE (addr) == PLUS)
11624 rtx addends[4], op;
11625 int n = 0, i;
11627 op = addr;
11630 if (n >= 4)
11631 return 0;
11632 addends[n++] = XEXP (op, 1);
11633 op = XEXP (op, 0);
11635 while (GET_CODE (op) == PLUS);
11636 if (n >= 4)
11637 return 0;
11638 addends[n] = op;
11640 for (i = n; i >= 0; --i)
11642 op = addends[i];
11643 switch (GET_CODE (op))
11645 case MULT:
11646 if (index)
11647 return 0;
11648 index = XEXP (op, 0);
11649 scale_rtx = XEXP (op, 1);
11650 break;
11652 case ASHIFT:
11653 if (index)
11654 return 0;
11655 index = XEXP (op, 0);
11656 tmp = XEXP (op, 1);
11657 if (!CONST_INT_P (tmp))
11658 return 0;
11659 scale = INTVAL (tmp);
11660 if ((unsigned HOST_WIDE_INT) scale > 3)
11661 return 0;
11662 scale = 1 << scale;
11663 break;
11665 case ZERO_EXTEND:
11666 op = XEXP (op, 0);
11667 if (GET_CODE (op) != UNSPEC)
11668 return 0;
11669 /* FALLTHRU */
11671 case UNSPEC:
11672 if (XINT (op, 1) == UNSPEC_TP
11673 && TARGET_TLS_DIRECT_SEG_REFS
11674 && seg == SEG_DEFAULT)
11675 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11676 else
11677 return 0;
11678 break;
11680 case SUBREG:
11681 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11682 return 0;
11683 /* FALLTHRU */
11685 case REG:
11686 if (!base)
11687 base = op;
11688 else if (!index)
11689 index = op;
11690 else
11691 return 0;
11692 break;
11694 case CONST:
11695 case CONST_INT:
11696 case SYMBOL_REF:
11697 case LABEL_REF:
11698 if (disp)
11699 return 0;
11700 disp = op;
11701 break;
11703 default:
11704 return 0;
11708 else if (GET_CODE (addr) == MULT)
11710 index = XEXP (addr, 0); /* index*scale */
11711 scale_rtx = XEXP (addr, 1);
11713 else if (GET_CODE (addr) == ASHIFT)
11715 /* We're called for lea too, which implements ashift on occasion. */
11716 index = XEXP (addr, 0);
11717 tmp = XEXP (addr, 1);
11718 if (!CONST_INT_P (tmp))
11719 return 0;
11720 scale = INTVAL (tmp);
11721 if ((unsigned HOST_WIDE_INT) scale > 3)
11722 return 0;
11723 scale = 1 << scale;
11724 retval = -1;
11726 else if (CONST_INT_P (addr))
11728 if (!x86_64_immediate_operand (addr, VOIDmode))
11729 return 0;
11731 /* Constant addresses are sign-extended to 64 bits; we have to
11732 reject addresses from 0x80000000 to 0xffffffff in x32 mode. */
11733 if (TARGET_X32
11734 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11735 return 0;
11737 disp = addr;
11739 else
11740 disp = addr; /* displacement */
11742 if (index)
11744 if (REG_P (index))
11746 else if (GET_CODE (index) == SUBREG
11747 && ix86_address_subreg_operand (SUBREG_REG (index)))
11749 else
11750 return 0;
11753 /* Address override works only on the (%reg) part of %fs:(%reg). */
11754 if (seg != SEG_DEFAULT
11755 && ((base && GET_MODE (base) != word_mode)
11756 || (index && GET_MODE (index) != word_mode)))
11757 return 0;
11759 /* Extract the integral value of scale. */
11760 if (scale_rtx)
11762 if (!CONST_INT_P (scale_rtx))
11763 return 0;
11764 scale = INTVAL (scale_rtx);
11767 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11768 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11770 /* Avoid useless 0 displacement. */
11771 if (disp == const0_rtx && (base || index))
11772 disp = NULL_RTX;
11774 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
11775 if (base_reg && index_reg && scale == 1
11776 && (index_reg == arg_pointer_rtx
11777 || index_reg == frame_pointer_rtx
11778 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11780 rtx tmp;
11781 tmp = base, base = index, index = tmp;
11782 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11785 /* Special case: %ebp cannot be encoded as a base without a displacement.
11786 Similarly %r13. */
11787 if (!disp
11788 && base_reg
11789 && (base_reg == hard_frame_pointer_rtx
11790 || base_reg == frame_pointer_rtx
11791 || base_reg == arg_pointer_rtx
11792 || (REG_P (base_reg)
11793 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11794 || REGNO (base_reg) == R13_REG))))
11795 disp = const0_rtx;
11797 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11798 Avoid this by transforming to [%esi+0].
11799 Reload calls address legitimization without cfun defined, so we need
11800 to test cfun for being non-NULL. */
11801 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11802 && base_reg && !index_reg && !disp
11803 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11804 disp = const0_rtx;
11806 /* Special case: encode reg+reg instead of reg*2. */
11807 if (!base && index && scale == 2)
11808 base = index, base_reg = index_reg, scale = 1;
11810 /* Special case: scaling cannot be encoded without base or displacement. */
11811 if (!base && !disp && index && scale != 1)
11812 disp = const0_rtx;
11814 out->base = base;
11815 out->index = index;
11816 out->disp = disp;
11817 out->scale = scale;
11818 out->seg = seg;
11820 return retval;
11823 /* Return cost of the memory address x.
11824 For i386, it is better to use a complex address than let gcc copy
11825 the address into a reg and make a new pseudo. But not if the address
11826 requires two regs - that would mean more pseudos with longer
11827 lifetimes. */
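/* Reading the heuristics below: an address built only from hard registers
   keeps the base cost of 1; an address whose base or index is still a
   pseudo costs 2; one using two distinct pseudos costs 3, before any
   AMD-K6 penalty is added.  */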
11828 static int
11829 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11830 addr_space_t as ATTRIBUTE_UNUSED,
11831 bool speed ATTRIBUTE_UNUSED)
11833 struct ix86_address parts;
11834 int cost = 1;
11835 int ok = ix86_decompose_address (x, &parts);
11837 gcc_assert (ok);
11839 if (parts.base && GET_CODE (parts.base) == SUBREG)
11840 parts.base = SUBREG_REG (parts.base);
11841 if (parts.index && GET_CODE (parts.index) == SUBREG)
11842 parts.index = SUBREG_REG (parts.index);
11844 /* Attempt to minimize number of registers in the address. */
11845 if ((parts.base
11846 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11847 || (parts.index
11848 && (!REG_P (parts.index)
11849 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11850 cost++;
11852 if (parts.base
11853 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11854 && parts.index
11855 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11856 && parts.base != parts.index)
11857 cost++;
11859 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11860 since its predecode logic can't detect the length of instructions
11861 and it degenerates to vector decoding. Increase the cost of such
11862 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11863 to split such addresses or even refuse such addresses at all.
11865 Following addressing modes are affected:
11866 [base+scale*index]
11867 [scale*index+disp]
11868 [base+index]
11870 The first and last cases may be avoidable by explicitly coding the zero
11871 in the memory address, but I don't have an AMD-K6 machine handy to check
11872 this theory. */
11874 if (TARGET_K6
11875 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11876 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11877 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11878 cost += 10;
11880 return cost;
11883 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11884 this is used to form addresses to local data when -fPIC is in
11885 use. */
11887 static bool
11888 darwin_local_data_pic (rtx disp)
11890 return (GET_CODE (disp) == UNSPEC
11891 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11894 /* Determine if a given RTX is a valid constant. We already know this
11895 satisfies CONSTANT_P. */
11897 static bool
11898 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11900 switch (GET_CODE (x))
11902 case CONST:
11903 x = XEXP (x, 0);
11905 if (GET_CODE (x) == PLUS)
11907 if (!CONST_INT_P (XEXP (x, 1)))
11908 return false;
11909 x = XEXP (x, 0);
11912 if (TARGET_MACHO && darwin_local_data_pic (x))
11913 return true;
11915 /* Only some unspecs are valid as "constants". */
11916 if (GET_CODE (x) == UNSPEC)
11917 switch (XINT (x, 1))
11919 case UNSPEC_GOT:
11920 case UNSPEC_GOTOFF:
11921 case UNSPEC_PLTOFF:
11922 return TARGET_64BIT;
11923 case UNSPEC_TPOFF:
11924 case UNSPEC_NTPOFF:
11925 x = XVECEXP (x, 0, 0);
11926 return (GET_CODE (x) == SYMBOL_REF
11927 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11928 case UNSPEC_DTPOFF:
11929 x = XVECEXP (x, 0, 0);
11930 return (GET_CODE (x) == SYMBOL_REF
11931 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11932 default:
11933 return false;
11936 /* We must have drilled down to a symbol. */
11937 if (GET_CODE (x) == LABEL_REF)
11938 return true;
11939 if (GET_CODE (x) != SYMBOL_REF)
11940 return false;
11941 /* FALLTHRU */
11943 case SYMBOL_REF:
11944 /* TLS symbols are never valid. */
11945 if (SYMBOL_REF_TLS_MODEL (x))
11946 return false;
11948 /* DLLIMPORT symbols are never valid. */
11949 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11950 && SYMBOL_REF_DLLIMPORT_P (x))
11951 return false;
11953 #if TARGET_MACHO
11954 /* mdynamic-no-pic */
11955 if (MACHO_DYNAMIC_NO_PIC_P)
11956 return machopic_symbol_defined_p (x);
11957 #endif
11958 break;
11960 case CONST_DOUBLE:
11961 if (GET_MODE (x) == TImode
11962 && x != CONST0_RTX (TImode)
11963 && !TARGET_64BIT)
11964 return false;
11965 break;
11967 case CONST_VECTOR:
11968 if (!standard_sse_constant_p (x))
11969 return false;
11971 default:
11972 break;
11975 /* Otherwise we handle everything else in the move patterns. */
11976 return true;
11979 /* Determine if it's legal to put X into the constant pool. This
11980 is not possible for the address of thread-local symbols, which
11981 is checked above. */
11983 static bool
11984 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11986 /* We can always put integral constants and vectors in memory. */
11987 switch (GET_CODE (x))
11989 case CONST_INT:
11990 case CONST_DOUBLE:
11991 case CONST_VECTOR:
11992 return false;
11994 default:
11995 break;
11997 return !ix86_legitimate_constant_p (mode, x);
12001 /* Nonzero if the constant value X is a legitimate general operand
12002 when generating PIC code. It is given that flag_pic is on and
12003 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12005 bool
12006 legitimate_pic_operand_p (rtx x)
12008 rtx inner;
12010 switch (GET_CODE (x))
12012 case CONST:
12013 inner = XEXP (x, 0);
12014 if (GET_CODE (inner) == PLUS
12015 && CONST_INT_P (XEXP (inner, 1)))
12016 inner = XEXP (inner, 0);
12018 /* Only some unspecs are valid as "constants". */
12019 if (GET_CODE (inner) == UNSPEC)
12020 switch (XINT (inner, 1))
12022 case UNSPEC_GOT:
12023 case UNSPEC_GOTOFF:
12024 case UNSPEC_PLTOFF:
12025 return TARGET_64BIT;
12026 case UNSPEC_TPOFF:
12027 x = XVECEXP (inner, 0, 0);
12028 return (GET_CODE (x) == SYMBOL_REF
12029 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12030 case UNSPEC_MACHOPIC_OFFSET:
12031 return legitimate_pic_address_disp_p (x);
12032 default:
12033 return false;
12035 /* FALLTHRU */
12037 case SYMBOL_REF:
12038 case LABEL_REF:
12039 return legitimate_pic_address_disp_p (x);
12041 default:
12042 return true;
12046 /* Determine if a given CONST RTX is a valid memory displacement
12047 in PIC mode. */
12049 bool
12050 legitimate_pic_address_disp_p (rtx disp)
12052 bool saw_plus;
12054 /* In 64bit mode we can allow direct addresses of symbols and labels
12055 when they are not dynamic symbols. */
12056 if (TARGET_64BIT)
12058 rtx op0 = disp, op1;
12060 switch (GET_CODE (disp))
12062 case LABEL_REF:
12063 return true;
12065 case CONST:
12066 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12067 break;
12068 op0 = XEXP (XEXP (disp, 0), 0);
12069 op1 = XEXP (XEXP (disp, 0), 1);
12070 if (!CONST_INT_P (op1)
12071 || INTVAL (op1) >= 16*1024*1024
12072 || INTVAL (op1) < -16*1024*1024)
12073 break;
12074 if (GET_CODE (op0) == LABEL_REF)
12075 return true;
12076 if (GET_CODE (op0) == CONST
12077 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12078 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12079 return true;
12080 if (GET_CODE (op0) == UNSPEC
12081 && XINT (op0, 1) == UNSPEC_PCREL)
12082 return true;
12083 if (GET_CODE (op0) != SYMBOL_REF)
12084 break;
12085 /* FALLTHRU */
12087 case SYMBOL_REF:
12088 /* TLS references should always be enclosed in UNSPEC. */
12089 if (SYMBOL_REF_TLS_MODEL (op0))
12090 return false;
12091 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12092 && ix86_cmodel != CM_LARGE_PIC)
12093 return true;
12094 break;
12096 default:
12097 break;
12100 if (GET_CODE (disp) != CONST)
12101 return false;
12102 disp = XEXP (disp, 0);
12104 if (TARGET_64BIT)
12106 /* It is not safe to allow PLUS expressions; this limits the allowed
12107 distance into the GOT table. We should not need these anyway. */
12108 if (GET_CODE (disp) != UNSPEC
12109 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12110 && XINT (disp, 1) != UNSPEC_GOTOFF
12111 && XINT (disp, 1) != UNSPEC_PCREL
12112 && XINT (disp, 1) != UNSPEC_PLTOFF))
12113 return false;
12115 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12116 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12117 return false;
12118 return true;
12121 saw_plus = false;
12122 if (GET_CODE (disp) == PLUS)
12124 if (!CONST_INT_P (XEXP (disp, 1)))
12125 return false;
12126 disp = XEXP (disp, 0);
12127 saw_plus = true;
12130 if (TARGET_MACHO && darwin_local_data_pic (disp))
12131 return true;
12133 if (GET_CODE (disp) != UNSPEC)
12134 return false;
12136 switch (XINT (disp, 1))
12138 case UNSPEC_GOT:
12139 if (saw_plus)
12140 return false;
12141 /* We need to check for both symbols and labels because VxWorks loads
12142 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12143 details. */
12144 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12145 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12146 case UNSPEC_GOTOFF:
12147 /* Refuse GOTOFF in 64-bit mode, since it is always 64-bit when used.
12148 While the ABI also specifies a 32-bit relocation, we don't produce it
12149 in the small PIC model at all. */
12150 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12151 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12152 && !TARGET_64BIT)
12153 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12154 return false;
12155 case UNSPEC_GOTTPOFF:
12156 case UNSPEC_GOTNTPOFF:
12157 case UNSPEC_INDNTPOFF:
12158 if (saw_plus)
12159 return false;
12160 disp = XVECEXP (disp, 0, 0);
12161 return (GET_CODE (disp) == SYMBOL_REF
12162 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12163 case UNSPEC_NTPOFF:
12164 disp = XVECEXP (disp, 0, 0);
12165 return (GET_CODE (disp) == SYMBOL_REF
12166 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12167 case UNSPEC_DTPOFF:
12168 disp = XVECEXP (disp, 0, 0);
12169 return (GET_CODE (disp) == SYMBOL_REF
12170 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12173 return false;
12176 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12177 replace the input X, or the original X if no replacement is called for.
12178 The output parameter *WIN is 1 if the calling macro should goto WIN,
12179 0 if it should not. */
12181 bool
12182 ix86_legitimize_reload_address (rtx x,
12183 enum machine_mode mode ATTRIBUTE_UNUSED,
12184 int opnum, int type,
12185 int ind_levels ATTRIBUTE_UNUSED)
12187 /* Reload can generate:
12189 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12190 (reg:DI 97))
12191 (reg:DI 2 cx))
12193 This RTX is rejected from ix86_legitimate_address_p due to
12194 non-strictness of base register 97. Following this rejection,
12195 reload pushes all three components into separate registers,
12196 creating invalid memory address RTX.
12198 The following code reloads only the invalid part of the
12199 memory address RTX. */
12201 if (GET_CODE (x) == PLUS
12202 && REG_P (XEXP (x, 1))
12203 && GET_CODE (XEXP (x, 0)) == PLUS
12204 && REG_P (XEXP (XEXP (x, 0), 1)))
12206 rtx base, index;
12207 bool something_reloaded = false;
12209 base = XEXP (XEXP (x, 0), 1);
12210 if (!REG_OK_FOR_BASE_STRICT_P (base))
12212 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12213 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12214 opnum, (enum reload_type) type);
12215 something_reloaded = true;
12218 index = XEXP (x, 1);
12219 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12221 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12222 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12223 opnum, (enum reload_type) type);
12224 something_reloaded = true;
12227 gcc_assert (something_reloaded);
12228 return true;
12231 return false;
12234 /* Recognizes RTL expressions that are valid memory addresses for an
12235 instruction. The MODE argument is the machine mode for the MEM
12236 expression that wants to use this address.
12238 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12239 convert common non-canonical forms to canonical form so that they will
12240 be recognized. */
12242 static bool
12243 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12244 rtx addr, bool strict)
12246 struct ix86_address parts;
12247 rtx base, index, disp;
12248 HOST_WIDE_INT scale;
12250 if (ix86_decompose_address (addr, &parts) <= 0)
12251 /* Decomposition failed. */
12252 return false;
12254 base = parts.base;
12255 index = parts.index;
12256 disp = parts.disp;
12257 scale = parts.scale;
12259 /* Validate base register. */
12260 if (base)
12262 rtx reg;
12264 if (REG_P (base))
12265 reg = base;
12266 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12267 reg = SUBREG_REG (base);
12268 else
12269 /* Base is not a register. */
12270 return false;
12272 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12273 return false;
12275 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12276 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12277 /* Base is not valid. */
12278 return false;
12281 /* Validate index register. */
12282 if (index)
12284 rtx reg;
12286 if (REG_P (index))
12287 reg = index;
12288 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12289 reg = SUBREG_REG (index);
12290 else
12291 /* Index is not a register. */
12292 return false;
12294 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12295 return false;
12297 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12298 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12299 /* Index is not valid. */
12300 return false;
12303 /* Index and base should have the same mode. */
12304 if (base && index
12305 && GET_MODE (base) != GET_MODE (index))
12306 return false;
12308 /* Validate scale factor. */
12309 if (scale != 1)
12311 if (!index)
12312 /* Scale without index. */
12313 return false;
12315 if (scale != 2 && scale != 4 && scale != 8)
12316 /* Scale is not a valid multiplier. */
12317 return false;
12320 /* Validate displacement. */
12321 if (disp)
12323 if (GET_CODE (disp) == CONST
12324 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12325 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12326 switch (XINT (XEXP (disp, 0), 1))
12328 /* Refuse GOTOFF and GOT in 64-bit mode, since they are always 64-bit
12329 when used. While the ABI also specifies 32-bit relocations, we don't
12330 produce them at all and use IP-relative addressing instead. */
12331 case UNSPEC_GOT:
12332 case UNSPEC_GOTOFF:
12333 gcc_assert (flag_pic);
12334 if (!TARGET_64BIT)
12335 goto is_legitimate_pic;
12337 /* 64bit address unspec. */
12338 return false;
12340 case UNSPEC_GOTPCREL:
12341 case UNSPEC_PCREL:
12342 gcc_assert (flag_pic);
12343 goto is_legitimate_pic;
12345 case UNSPEC_GOTTPOFF:
12346 case UNSPEC_GOTNTPOFF:
12347 case UNSPEC_INDNTPOFF:
12348 case UNSPEC_NTPOFF:
12349 case UNSPEC_DTPOFF:
12350 break;
12352 case UNSPEC_STACK_CHECK:
12353 gcc_assert (flag_split_stack);
12354 break;
12356 default:
12357 /* Invalid address unspec. */
12358 return false;
12361 else if (SYMBOLIC_CONST (disp)
12362 && (flag_pic
12363 || (TARGET_MACHO
12364 #if TARGET_MACHO
12365 && MACHOPIC_INDIRECT
12366 && !machopic_operand_p (disp)
12367 #endif
12371 is_legitimate_pic:
12372 if (TARGET_64BIT && (index || base))
12374 /* foo@dtpoff(%rX) is ok. */
12375 if (GET_CODE (disp) != CONST
12376 || GET_CODE (XEXP (disp, 0)) != PLUS
12377 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12378 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12379 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12380 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12381 /* Non-constant pic memory reference. */
12382 return false;
12384 else if ((!TARGET_MACHO || flag_pic)
12385 && ! legitimate_pic_address_disp_p (disp))
12386 /* Displacement is an invalid pic construct. */
12387 return false;
12388 #if TARGET_MACHO
12389 else if (MACHO_DYNAMIC_NO_PIC_P
12390 && !ix86_legitimate_constant_p (Pmode, disp))
12391 /* The displacement must be referenced via a non_lazy_pointer. */
12392 return false;
12393 #endif
12395 /* This code used to verify that a symbolic pic displacement
12396 includes the pic_offset_table_rtx register.
12398 While this is a good idea, unfortunately these constructs may
12399 be created by the "adds using lea" optimization for incorrect
12400 code like:
12402 int a;
12403 int foo(int i)
12405 return *(&a+i);
12408 This code is nonsensical, but results in addressing the
12409 GOT table with pic_offset_table_rtx as the base. We can't
12410 just refuse it easily, since it gets matched by the
12411 "addsi3" pattern, which is later split to lea when the
12412 output register differs from the input. While this could
12413 be handled by a separate addsi pattern for this case
12414 that never results in lea, disabling this test seems to be
12415 the easier and correct fix for the crash. */
12417 else if (GET_CODE (disp) != LABEL_REF
12418 && !CONST_INT_P (disp)
12419 && (GET_CODE (disp) != CONST
12420 || !ix86_legitimate_constant_p (Pmode, disp))
12421 && (GET_CODE (disp) != SYMBOL_REF
12422 || !ix86_legitimate_constant_p (Pmode, disp)))
12423 /* Displacement is not constant. */
12424 return false;
12425 else if (TARGET_64BIT
12426 && !x86_64_immediate_operand (disp, VOIDmode))
12427 /* Displacement is out of range. */
12428 return false;
12431 /* Everything looks valid. */
12432 return true;
12435 /* Determine if a given RTX is a valid constant address. */
12437 bool
12438 constant_address_p (rtx x)
12440 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12443 /* Return a unique alias set for the GOT. */
12445 static alias_set_type
12446 ix86_GOT_alias_set (void)
12448 static alias_set_type set = -1;
12449 if (set == -1)
12450 set = new_alias_set ();
12451 return set;
12454 /* Return a legitimate reference for ORIG (an address) using the
12455 register REG. If REG is 0, a new pseudo is generated.
12457 There are two types of references that must be handled:
12459 1. Global data references must load the address from the GOT, via
12460 the PIC reg. An insn is emitted to do this load, and the reg is
12461 returned.
12463 2. Static data references, constant pool addresses, and code labels
12464 compute the address as an offset from the GOT, whose base is in
12465 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12466 differentiate them from global data objects. The returned
12467 address is the PIC reg + an unspec constant.
12469 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12470 reg also appears in the address. */
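/* Concretely (32-bit ELF shown only as an illustration): case 1 becomes a
   load through the GOT, roughly "movl foo@GOT(%ebx), %reg", while case 2
   becomes an offset from the GOT base, roughly "leal foo@GOTOFF(%ebx), %reg".
   The PIC register and relocation forms vary with the target.  */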
12472 static rtx
12473 legitimize_pic_address (rtx orig, rtx reg)
12475 rtx addr = orig;
12476 rtx new_rtx = orig;
12478 #if TARGET_MACHO
12479 if (TARGET_MACHO && !TARGET_64BIT)
12481 if (reg == 0)
12482 reg = gen_reg_rtx (Pmode);
12483 /* Use the generic Mach-O PIC machinery. */
12484 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12486 #endif
12488 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12489 new_rtx = addr;
12490 else if (TARGET_64BIT
12491 && ix86_cmodel != CM_SMALL_PIC
12492 && gotoff_operand (addr, Pmode))
12494 rtx tmpreg;
12495 /* This symbol may be referenced via a displacement from the PIC
12496 base address (@GOTOFF). */
12498 if (reload_in_progress)
12499 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12500 if (GET_CODE (addr) == CONST)
12501 addr = XEXP (addr, 0);
12502 if (GET_CODE (addr) == PLUS)
12504 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12505 UNSPEC_GOTOFF);
12506 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12508 else
12509 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12510 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12511 if (!reg)
12512 tmpreg = gen_reg_rtx (Pmode);
12513 else
12514 tmpreg = reg;
12515 emit_move_insn (tmpreg, new_rtx);
12517 if (reg != 0)
12519 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12520 tmpreg, 1, OPTAB_DIRECT);
12521 new_rtx = reg;
12523 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12525 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12527 /* This symbol may be referenced via a displacement from the PIC
12528 base address (@GOTOFF). */
12530 if (reload_in_progress)
12531 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12532 if (GET_CODE (addr) == CONST)
12533 addr = XEXP (addr, 0);
12534 if (GET_CODE (addr) == PLUS)
12536 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12537 UNSPEC_GOTOFF);
12538 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12540 else
12541 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12542 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12543 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12545 if (reg != 0)
12547 emit_move_insn (reg, new_rtx);
12548 new_rtx = reg;
12551 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12552 /* We can't use @GOTOFF for text labels on VxWorks;
12553 see gotoff_operand. */
12554 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12556 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12558 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12559 return legitimize_dllimport_symbol (addr, true);
12560 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12561 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12562 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12564 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12565 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12569 /* For x64 PE-COFF there is no GOT table, so we use the
12570 address directly. */
12571 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12573 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12574 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12576 if (reg == 0)
12577 reg = gen_reg_rtx (Pmode);
12578 emit_move_insn (reg, new_rtx);
12579 new_rtx = reg;
12581 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12583 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12584 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12585 new_rtx = gen_const_mem (Pmode, new_rtx);
12586 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12588 if (reg == 0)
12589 reg = gen_reg_rtx (Pmode);
12590 /* Use gen_movsi directly, otherwise the address is loaded
12591 into a register for CSE. We don't want to CSE these addresses;
12592 instead we CSE the addresses loaded from the GOT table, so skip this. */
12593 emit_insn (gen_movsi (reg, new_rtx));
12594 new_rtx = reg;
12596 else
12598 /* This symbol must be referenced via a load from the
12599 Global Offset Table (@GOT). */
12601 if (reload_in_progress)
12602 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12603 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12604 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12605 if (TARGET_64BIT)
12606 new_rtx = force_reg (Pmode, new_rtx);
12607 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12608 new_rtx = gen_const_mem (Pmode, new_rtx);
12609 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12611 if (reg == 0)
12612 reg = gen_reg_rtx (Pmode);
12613 emit_move_insn (reg, new_rtx);
12614 new_rtx = reg;
12617 else
12619 if (CONST_INT_P (addr)
12620 && !x86_64_immediate_operand (addr, VOIDmode))
12622 if (reg)
12624 emit_move_insn (reg, addr);
12625 new_rtx = reg;
12627 else
12628 new_rtx = force_reg (Pmode, addr);
12630 else if (GET_CODE (addr) == CONST)
12632 addr = XEXP (addr, 0);
12634 /* We must match stuff we generate before. Assume the only
12635 unspecs that can get here are ours. Not that we could do
12636 anything with them anyway.... */
12637 if (GET_CODE (addr) == UNSPEC
12638 || (GET_CODE (addr) == PLUS
12639 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12640 return orig;
12641 gcc_assert (GET_CODE (addr) == PLUS);
12643 if (GET_CODE (addr) == PLUS)
12645 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12647 /* Check first to see if this is a constant offset from a @GOTOFF
12648 symbol reference. */
12649 if (gotoff_operand (op0, Pmode)
12650 && CONST_INT_P (op1))
12652 if (!TARGET_64BIT)
12654 if (reload_in_progress)
12655 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12656 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12657 UNSPEC_GOTOFF);
12658 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12659 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12660 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12662 if (reg != 0)
12664 emit_move_insn (reg, new_rtx);
12665 new_rtx = reg;
12668 else
12670 if (INTVAL (op1) < -16*1024*1024
12671 || INTVAL (op1) >= 16*1024*1024)
12673 if (!x86_64_immediate_operand (op1, Pmode))
12674 op1 = force_reg (Pmode, op1);
12675 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12679 else
12681 rtx base = legitimize_pic_address (op0, reg);
12682 enum machine_mode mode = GET_MODE (base);
12683 new_rtx
12684 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12686 if (CONST_INT_P (new_rtx))
12688 if (INTVAL (new_rtx) < -16*1024*1024
12689 || INTVAL (new_rtx) >= 16*1024*1024)
12691 if (!x86_64_immediate_operand (new_rtx, mode))
12692 new_rtx = force_reg (mode, new_rtx);
12693 new_rtx
12694 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12696 else
12697 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12699 else
12701 if (GET_CODE (new_rtx) == PLUS
12702 && CONSTANT_P (XEXP (new_rtx, 1)))
12704 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12705 new_rtx = XEXP (new_rtx, 1);
12707 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12712 return new_rtx;
12715 /* Load the thread pointer. If TO_REG is true, force it into a register. */
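/* Illustrative note: on a typical GNU/Linux target this UNSPEC_TP access
   is materialized as the thread-pointer segment base, e.g.
   "movq %fs:0, %rax" in 64-bit mode or "movl %gs:0, %eax" in 32-bit mode
   (the exact form depends on the target configuration). */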
12717 static rtx
12718 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12720 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12722 if (GET_MODE (tp) != tp_mode)
12724 gcc_assert (GET_MODE (tp) == SImode);
12725 gcc_assert (tp_mode == DImode);
12727 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12730 if (to_reg)
12731 tp = copy_to_mode_reg (tp_mode, tp);
12733 return tp;
12736 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12738 static GTY(()) rtx ix86_tls_symbol;
12740 static rtx
12741 ix86_tls_get_addr (void)
12743 if (!ix86_tls_symbol)
12745 const char *sym
12746 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12747 ? "___tls_get_addr" : "__tls_get_addr");
12749 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12752 return ix86_tls_symbol;
12755 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12757 static GTY(()) rtx ix86_tls_module_base_symbol;
12760 ix86_tls_module_base (void)
12762 if (!ix86_tls_module_base_symbol)
12764 ix86_tls_module_base_symbol
12765 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12767 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12768 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12771 return ix86_tls_module_base_symbol;
12774 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12775 false if we expect this to be used for a memory address and true if
12776 we expect to load the address into a register. */
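/* A rough sketch of what the TLS models handled below expand to on a
   typical GNU/Linux target (illustrative only):

     global/local dynamic: call __tls_get_addr (or, with
       -mtls-dialect=gnu2, use the TLS descriptor sequence) to obtain the
       address at run time;
     initial exec: load the variable's TP-relative offset from the GOT and
       add the thread pointer, e.g.
         movq x@gottpoff(%rip), %rax
         movq %fs:(%rax), %rax
     local exec: fold the known offset directly into the access, e.g.
         movl %fs:x@tpoff, %eax

   See the ELF TLS ABI documents for the authoritative sequences. */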
12778 static rtx
12779 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12781 rtx dest, base, off;
12782 rtx pic = NULL_RTX, tp = NULL_RTX;
12783 enum machine_mode tp_mode = Pmode;
12784 int type;
12786 switch (model)
12788 case TLS_MODEL_GLOBAL_DYNAMIC:
12789 dest = gen_reg_rtx (Pmode);
12791 if (!TARGET_64BIT)
12793 if (flag_pic)
12794 pic = pic_offset_table_rtx;
12795 else
12797 pic = gen_reg_rtx (Pmode);
12798 emit_insn (gen_set_got (pic));
12802 if (TARGET_GNU2_TLS)
12804 if (TARGET_64BIT)
12805 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12806 else
12807 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12809 tp = get_thread_pointer (Pmode, true);
12810 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12812 if (GET_MODE (x) != Pmode)
12813 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12815 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12817 else
12819 rtx caddr = ix86_tls_get_addr ();
12821 if (TARGET_64BIT)
12823 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12824 rtx insns;
12826 start_sequence ();
12827 emit_call_insn
12828 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12829 insns = get_insns ();
12830 end_sequence ();
12832 if (GET_MODE (x) != Pmode)
12833 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12835 RTL_CONST_CALL_P (insns) = 1;
12836 emit_libcall_block (insns, dest, rax, x);
12838 else
12839 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12841 break;
12843 case TLS_MODEL_LOCAL_DYNAMIC:
12844 base = gen_reg_rtx (Pmode);
12846 if (!TARGET_64BIT)
12848 if (flag_pic)
12849 pic = pic_offset_table_rtx;
12850 else
12852 pic = gen_reg_rtx (Pmode);
12853 emit_insn (gen_set_got (pic));
12857 if (TARGET_GNU2_TLS)
12859 rtx tmp = ix86_tls_module_base ();
12861 if (TARGET_64BIT)
12862 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12863 else
12864 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12866 tp = get_thread_pointer (Pmode, true);
12867 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12868 gen_rtx_MINUS (Pmode, tmp, tp));
12870 else
12872 rtx caddr = ix86_tls_get_addr ();
12874 if (TARGET_64BIT)
12876 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12877 rtx insns, eqv;
12879 start_sequence ();
12880 emit_call_insn
12881 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12882 insns = get_insns ();
12883 end_sequence ();
12885 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12886 share the LD_BASE result with other LD model accesses. */
12887 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12888 UNSPEC_TLS_LD_BASE);
12890 RTL_CONST_CALL_P (insns) = 1;
12891 emit_libcall_block (insns, base, rax, eqv);
12893 else
12894 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12897 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12898 off = gen_rtx_CONST (Pmode, off);
12900 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12902 if (TARGET_GNU2_TLS)
12904 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12906 if (GET_MODE (x) != Pmode)
12907 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12909 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12911 break;
12913 case TLS_MODEL_INITIAL_EXEC:
12914 if (TARGET_64BIT)
12916 if (TARGET_SUN_TLS && !TARGET_X32)
12918 /* The Sun linker took the AMD64 TLS spec literally
12919 and can only handle %rax as the destination of the
12920 initial-exec code sequence. */
12922 dest = gen_reg_rtx (DImode);
12923 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12924 return dest;
12927 /* Generate DImode references to avoid %fs:(%reg32)
12928 problems and a linker IE->LE relaxation bug. */
12929 tp_mode = DImode;
12930 pic = NULL;
12931 type = UNSPEC_GOTNTPOFF;
12933 else if (flag_pic)
12935 if (reload_in_progress)
12936 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12937 pic = pic_offset_table_rtx;
12938 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12940 else if (!TARGET_ANY_GNU_TLS)
12942 pic = gen_reg_rtx (Pmode);
12943 emit_insn (gen_set_got (pic));
12944 type = UNSPEC_GOTTPOFF;
12946 else
12948 pic = NULL;
12949 type = UNSPEC_INDNTPOFF;
12952 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12953 off = gen_rtx_CONST (tp_mode, off);
12954 if (pic)
12955 off = gen_rtx_PLUS (tp_mode, pic, off);
12956 off = gen_const_mem (tp_mode, off);
12957 set_mem_alias_set (off, ix86_GOT_alias_set ());
12959 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12961 base = get_thread_pointer (tp_mode,
12962 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12963 off = force_reg (tp_mode, off);
12964 return gen_rtx_PLUS (tp_mode, base, off);
12966 else
12968 base = get_thread_pointer (Pmode, true);
12969 dest = gen_reg_rtx (Pmode);
12970 emit_insn (ix86_gen_sub3 (dest, base, off));
12972 break;
12974 case TLS_MODEL_LOCAL_EXEC:
12975 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12976 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12977 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12978 off = gen_rtx_CONST (Pmode, off);
12980 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12982 base = get_thread_pointer (Pmode,
12983 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12984 return gen_rtx_PLUS (Pmode, base, off);
12986 else
12988 base = get_thread_pointer (Pmode, true);
12989 dest = gen_reg_rtx (Pmode);
12990 emit_insn (ix86_gen_sub3 (dest, base, off));
12992 break;
12994 default:
12995 gcc_unreachable ();
12998 return dest;
13001 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13002 to symbol DECL. */
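/* For example (illustrative): a reference to a dllimport'ed symbol "foo"
   is rewritten to go through the pointer provided by the import library,
   i.e. "foo" becomes "*__imp_foo" (or "*__imp__foo" when symbols carry a
   leading underscore); the artificial VAR_DECL built below holds the RTL
   for that indirection. */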
13004 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13005 htab_t dllimport_map;
13007 static tree
13008 get_dllimport_decl (tree decl)
13010 struct tree_map *h, in;
13011 void **loc;
13012 const char *name;
13013 const char *prefix;
13014 size_t namelen, prefixlen;
13015 char *imp_name;
13016 tree to;
13017 rtx rtl;
13019 if (!dllimport_map)
13020 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13022 in.hash = htab_hash_pointer (decl);
13023 in.base.from = decl;
13024 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13025 h = (struct tree_map *) *loc;
13026 if (h)
13027 return h->to;
13029 *loc = h = ggc_alloc_tree_map ();
13030 h->hash = in.hash;
13031 h->base.from = decl;
13032 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13033 VAR_DECL, NULL, ptr_type_node);
13034 DECL_ARTIFICIAL (to) = 1;
13035 DECL_IGNORED_P (to) = 1;
13036 DECL_EXTERNAL (to) = 1;
13037 TREE_READONLY (to) = 1;
13039 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13040 name = targetm.strip_name_encoding (name);
13041 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13042 ? "*__imp_" : "*__imp__";
13043 namelen = strlen (name);
13044 prefixlen = strlen (prefix);
13045 imp_name = (char *) alloca (namelen + prefixlen + 1);
13046 memcpy (imp_name, prefix, prefixlen);
13047 memcpy (imp_name + prefixlen, name, namelen + 1);
13049 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13050 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13051 SET_SYMBOL_REF_DECL (rtl, to);
13052 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13054 rtl = gen_const_mem (Pmode, rtl);
13055 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13057 SET_DECL_RTL (to, rtl);
13058 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13060 return to;
13063 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13064 true if we require the result be a register. */
13066 static rtx
13067 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13069 tree imp_decl;
13070 rtx x;
13072 gcc_assert (SYMBOL_REF_DECL (symbol));
13073 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13075 x = DECL_RTL (imp_decl);
13076 if (want_reg)
13077 x = force_reg (Pmode, x);
13078 return x;
13081 /* Try machine-dependent ways of modifying an illegitimate address
13082 to be legitimate. If we find one, return the new, valid address.
13083 This macro is used in only one place: `memory_address' in explow.c.
13085 OLDX is the address as it was before break_out_memory_refs was called.
13086 In some cases it is useful to look at this to decide what needs to be done.
13088 It is always safe for this macro to do nothing. It exists to recognize
13089 opportunities to optimize the output.
13091 For the 80386, we handle X+REG by loading X into a register R and
13092 using R+REG. R will go in a general reg and indexing will be used.
13093 However, if REG is a broken-out memory address or multiplication,
13094 nothing needs to be done because REG can certainly go in a general reg.
13096 When -fpic is used, special handling is needed for symbolic references.
13097 See comments by legitimize_pic_address in i386.c for details. */
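/* As an illustrative example of the canonicalizations performed below, an
   address of the form

       (plus (ashift (reg idx) (const_int 2)) (reg base))

   is rewritten to

       (plus (mult (reg idx) (const_int 4)) (reg base))

   so that it matches the base + index*scale addressing mode, and symbolic
   constants are routed through legitimize_pic_address when -fpic is in
   effect. */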
13099 static rtx
13100 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13101 enum machine_mode mode)
13103 int changed = 0;
13104 unsigned log;
13106 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13107 if (log)
13108 return legitimize_tls_address (x, (enum tls_model) log, false);
13109 if (GET_CODE (x) == CONST
13110 && GET_CODE (XEXP (x, 0)) == PLUS
13111 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13112 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13114 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13115 (enum tls_model) log, false);
13116 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13119 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13121 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13122 return legitimize_dllimport_symbol (x, true);
13123 if (GET_CODE (x) == CONST
13124 && GET_CODE (XEXP (x, 0)) == PLUS
13125 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13126 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13128 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13129 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13133 if (flag_pic && SYMBOLIC_CONST (x))
13134 return legitimize_pic_address (x, 0);
13136 #if TARGET_MACHO
13137 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13138 return machopic_indirect_data_reference (x, 0);
13139 #endif
13141 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13142 if (GET_CODE (x) == ASHIFT
13143 && CONST_INT_P (XEXP (x, 1))
13144 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13146 changed = 1;
13147 log = INTVAL (XEXP (x, 1));
13148 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13149 GEN_INT (1 << log));
13152 if (GET_CODE (x) == PLUS)
13154 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13156 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13157 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13158 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13160 changed = 1;
13161 log = INTVAL (XEXP (XEXP (x, 0), 1));
13162 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13163 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13164 GEN_INT (1 << log));
13167 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13168 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13169 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13171 changed = 1;
13172 log = INTVAL (XEXP (XEXP (x, 1), 1));
13173 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13174 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13175 GEN_INT (1 << log));
13178 /* Put multiply first if it isn't already. */
13179 if (GET_CODE (XEXP (x, 1)) == MULT)
13181 rtx tmp = XEXP (x, 0);
13182 XEXP (x, 0) = XEXP (x, 1);
13183 XEXP (x, 1) = tmp;
13184 changed = 1;
13187 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13188 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13189 created by virtual register instantiation, register elimination, and
13190 similar optimizations. */
13191 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13193 changed = 1;
13194 x = gen_rtx_PLUS (Pmode,
13195 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13196 XEXP (XEXP (x, 1), 0)),
13197 XEXP (XEXP (x, 1), 1));
13200 /* Canonicalize
13201 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13202 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13203 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13204 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13205 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13206 && CONSTANT_P (XEXP (x, 1)))
13208 rtx constant;
13209 rtx other = NULL_RTX;
13211 if (CONST_INT_P (XEXP (x, 1)))
13213 constant = XEXP (x, 1);
13214 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13216 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13218 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13219 other = XEXP (x, 1);
13221 else
13222 constant = 0;
13224 if (constant)
13226 changed = 1;
13227 x = gen_rtx_PLUS (Pmode,
13228 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13229 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13230 plus_constant (Pmode, other,
13231 INTVAL (constant)));
13235 if (changed && ix86_legitimate_address_p (mode, x, false))
13236 return x;
13238 if (GET_CODE (XEXP (x, 0)) == MULT)
13240 changed = 1;
13241 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13244 if (GET_CODE (XEXP (x, 1)) == MULT)
13246 changed = 1;
13247 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13250 if (changed
13251 && REG_P (XEXP (x, 1))
13252 && REG_P (XEXP (x, 0)))
13253 return x;
13255 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13257 changed = 1;
13258 x = legitimize_pic_address (x, 0);
13261 if (changed && ix86_legitimate_address_p (mode, x, false))
13262 return x;
13264 if (REG_P (XEXP (x, 0)))
13266 rtx temp = gen_reg_rtx (Pmode);
13267 rtx val = force_operand (XEXP (x, 1), temp);
13268 if (val != temp)
13270 val = convert_to_mode (Pmode, val, 1);
13271 emit_move_insn (temp, val);
13274 XEXP (x, 1) = temp;
13275 return x;
13278 else if (REG_P (XEXP (x, 1)))
13280 rtx temp = gen_reg_rtx (Pmode);
13281 rtx val = force_operand (XEXP (x, 0), temp);
13282 if (val != temp)
13284 val = convert_to_mode (Pmode, val, 1);
13285 emit_move_insn (temp, val);
13288 XEXP (x, 0) = temp;
13289 return x;
13293 return x;
13296 /* Print an integer constant expression in assembler syntax. Addition
13297 and subtraction are the only arithmetic that may appear in these
13298 expressions. FILE is the stdio stream to write to, X is the rtx, and
13299 CODE is the operand print code from the output string. */
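/* For example (illustrative): a (const (unspec [foo] UNSPEC_GOTOFF))
   operand is printed as "foo@GOTOFF", and with operand code 'P' a
   non-local SYMBOL_REF gets an "@PLT" suffix appended. */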
13301 static void
13302 output_pic_addr_const (FILE *file, rtx x, int code)
13304 char buf[256];
13306 switch (GET_CODE (x))
13308 case PC:
13309 gcc_assert (flag_pic);
13310 putc ('.', file);
13311 break;
13313 case SYMBOL_REF:
13314 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13315 output_addr_const (file, x);
13316 else
13318 const char *name = XSTR (x, 0);
13320 /* Mark the decl as referenced so that cgraph will
13321 output the function. */
13322 if (SYMBOL_REF_DECL (x))
13323 mark_decl_referenced (SYMBOL_REF_DECL (x));
13325 #if TARGET_MACHO
13326 if (MACHOPIC_INDIRECT
13327 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13328 name = machopic_indirection_name (x, /*stub_p=*/true);
13329 #endif
13330 assemble_name (file, name);
13332 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13333 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13334 fputs ("@PLT", file);
13335 break;
13337 case LABEL_REF:
13338 x = XEXP (x, 0);
13339 /* FALLTHRU */
13340 case CODE_LABEL:
13341 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13342 assemble_name (asm_out_file, buf);
13343 break;
13345 case CONST_INT:
13346 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13347 break;
13349 case CONST:
13350 /* This used to output parentheses around the expression,
13351 but that does not work on the 386 (either ATT or BSD assembler). */
13352 output_pic_addr_const (file, XEXP (x, 0), code);
13353 break;
13355 case CONST_DOUBLE:
13356 if (GET_MODE (x) == VOIDmode)
13358 /* We can use %d if the number is <32 bits and positive. */
13359 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13360 fprintf (file, "0x%lx%08lx",
13361 (unsigned long) CONST_DOUBLE_HIGH (x),
13362 (unsigned long) CONST_DOUBLE_LOW (x));
13363 else
13364 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13366 else
13367 /* We can't handle floating point constants;
13368 TARGET_PRINT_OPERAND must handle them. */
13369 output_operand_lossage ("floating constant misused");
13370 break;
13372 case PLUS:
13373 /* Some assemblers need integer constants to appear first. */
13374 if (CONST_INT_P (XEXP (x, 0)))
13376 output_pic_addr_const (file, XEXP (x, 0), code);
13377 putc ('+', file);
13378 output_pic_addr_const (file, XEXP (x, 1), code);
13380 else
13382 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13383 output_pic_addr_const (file, XEXP (x, 1), code);
13384 putc ('+', file);
13385 output_pic_addr_const (file, XEXP (x, 0), code);
13387 break;
13389 case MINUS:
13390 if (!TARGET_MACHO)
13391 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13392 output_pic_addr_const (file, XEXP (x, 0), code);
13393 putc ('-', file);
13394 output_pic_addr_const (file, XEXP (x, 1), code);
13395 if (!TARGET_MACHO)
13396 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13397 break;
13399 case UNSPEC:
13400 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13402 bool f = i386_asm_output_addr_const_extra (file, x);
13403 gcc_assert (f);
13404 break;
13407 gcc_assert (XVECLEN (x, 0) == 1);
13408 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13409 switch (XINT (x, 1))
13411 case UNSPEC_GOT:
13412 fputs ("@GOT", file);
13413 break;
13414 case UNSPEC_GOTOFF:
13415 fputs ("@GOTOFF", file);
13416 break;
13417 case UNSPEC_PLTOFF:
13418 fputs ("@PLTOFF", file);
13419 break;
13420 case UNSPEC_PCREL:
13421 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13422 "(%rip)" : "[rip]", file);
13423 break;
13424 case UNSPEC_GOTPCREL:
13425 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13426 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13427 break;
13428 case UNSPEC_GOTTPOFF:
13429 /* FIXME: This might be @TPOFF in Sun ld too. */
13430 fputs ("@gottpoff", file);
13431 break;
13432 case UNSPEC_TPOFF:
13433 fputs ("@tpoff", file);
13434 break;
13435 case UNSPEC_NTPOFF:
13436 if (TARGET_64BIT)
13437 fputs ("@tpoff", file);
13438 else
13439 fputs ("@ntpoff", file);
13440 break;
13441 case UNSPEC_DTPOFF:
13442 fputs ("@dtpoff", file);
13443 break;
13444 case UNSPEC_GOTNTPOFF:
13445 if (TARGET_64BIT)
13446 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13447 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13448 else
13449 fputs ("@gotntpoff", file);
13450 break;
13451 case UNSPEC_INDNTPOFF:
13452 fputs ("@indntpoff", file);
13453 break;
13454 #if TARGET_MACHO
13455 case UNSPEC_MACHOPIC_OFFSET:
13456 putc ('-', file);
13457 machopic_output_function_base_name (file);
13458 break;
13459 #endif
13460 default:
13461 output_operand_lossage ("invalid UNSPEC as operand");
13462 break;
13464 break;
13466 default:
13467 output_operand_lossage ("invalid expression as operand");
13471 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13472 We need to emit DTP-relative relocations. */
13474 static void ATTRIBUTE_UNUSED
13475 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13477 fputs (ASM_LONG, file);
13478 output_addr_const (file, x);
13479 fputs ("@dtpoff", file);
13480 switch (size)
13482 case 4:
13483 break;
13484 case 8:
13485 fputs (", 0", file);
13486 break;
13487 default:
13488 gcc_unreachable ();
13492 /* Return true if X is a representation of the PIC register. This copes
13493 with calls from ix86_find_base_term, where the register might have
13494 been replaced by a cselib value. */
13496 static bool
13497 ix86_pic_register_p (rtx x)
13499 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13500 return (pic_offset_table_rtx
13501 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13502 else
13503 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13506 /* Helper function for ix86_delegitimize_address.
13507 Attempt to delegitimize TLS local-exec accesses. */
13509 static rtx
13510 ix86_delegitimize_tls_address (rtx orig_x)
13512 rtx x = orig_x, unspec;
13513 struct ix86_address addr;
13515 if (!TARGET_TLS_DIRECT_SEG_REFS)
13516 return orig_x;
13517 if (MEM_P (x))
13518 x = XEXP (x, 0);
13519 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13520 return orig_x;
13521 if (ix86_decompose_address (x, &addr) == 0
13522 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13523 || addr.disp == NULL_RTX
13524 || GET_CODE (addr.disp) != CONST)
13525 return orig_x;
13526 unspec = XEXP (addr.disp, 0);
13527 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13528 unspec = XEXP (unspec, 0);
13529 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13530 return orig_x;
13531 x = XVECEXP (unspec, 0, 0);
13532 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13533 if (unspec != XEXP (addr.disp, 0))
13534 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13535 if (addr.index)
13537 rtx idx = addr.index;
13538 if (addr.scale != 1)
13539 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13540 x = gen_rtx_PLUS (Pmode, idx, x);
13542 if (addr.base)
13543 x = gen_rtx_PLUS (Pmode, addr.base, x);
13544 if (MEM_P (orig_x))
13545 x = replace_equiv_address_nv (orig_x, x);
13546 return x;
13549 /* In the name of slightly smaller debug output, and to cater to
13550 general assembler lossage, recognize PIC+GOTOFF and turn it back
13551 into a direct symbol reference.
13553 On Darwin, this is necessary to avoid a crash, because Darwin
13554 has a different PIC label for each routine but the DWARF debugging
13555 information is not associated with any particular routine, so it's
13556 necessary to remove references to the PIC label from RTL stored by
13557 the DWARF output code. */
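/* Illustrative examples of the delegitimization below: for 32-bit PIC,

       (plus (reg ebx) (const (unspec [foo] UNSPEC_GOTOFF)))

   is turned back into plain "foo", and for 64-bit,

       (mem (const (unspec [foo] UNSPEC_GOTPCREL)))

   likewise recovers the underlying symbol (sketch only; the real code
   also handles extra index and addend terms). */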
13559 static rtx
13560 ix86_delegitimize_address (rtx x)
13562 rtx orig_x = delegitimize_mem_from_attrs (x);
13563 /* addend is NULL or some rtx if x is something+GOTOFF where
13564 something doesn't include the PIC register. */
13565 rtx addend = NULL_RTX;
13566 /* reg_addend is NULL or a multiple of some register. */
13567 rtx reg_addend = NULL_RTX;
13568 /* const_addend is NULL or a const_int. */
13569 rtx const_addend = NULL_RTX;
13570 /* This is the result, or NULL. */
13571 rtx result = NULL_RTX;
13573 x = orig_x;
13575 if (MEM_P (x))
13576 x = XEXP (x, 0);
13578 if (TARGET_64BIT)
13580 if (GET_CODE (x) == CONST
13581 && GET_CODE (XEXP (x, 0)) == PLUS
13582 && GET_MODE (XEXP (x, 0)) == Pmode
13583 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13584 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13585 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13587 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13588 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13589 if (MEM_P (orig_x))
13590 x = replace_equiv_address_nv (orig_x, x);
13591 return x;
13593 if (GET_CODE (x) != CONST
13594 || GET_CODE (XEXP (x, 0)) != UNSPEC
13595 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13596 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13597 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13598 return ix86_delegitimize_tls_address (orig_x);
13599 x = XVECEXP (XEXP (x, 0), 0, 0);
13600 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13602 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13603 GET_MODE (x), 0);
13604 if (x == NULL_RTX)
13605 return orig_x;
13607 return x;
13610 if (GET_CODE (x) != PLUS
13611 || GET_CODE (XEXP (x, 1)) != CONST)
13612 return ix86_delegitimize_tls_address (orig_x);
13614 if (ix86_pic_register_p (XEXP (x, 0)))
13615 /* %ebx + GOT/GOTOFF */
13617 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13619 /* %ebx + %reg * scale + GOT/GOTOFF */
13620 reg_addend = XEXP (x, 0);
13621 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13622 reg_addend = XEXP (reg_addend, 1);
13623 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13624 reg_addend = XEXP (reg_addend, 0);
13625 else
13627 reg_addend = NULL_RTX;
13628 addend = XEXP (x, 0);
13631 else
13632 addend = XEXP (x, 0);
13634 x = XEXP (XEXP (x, 1), 0);
13635 if (GET_CODE (x) == PLUS
13636 && CONST_INT_P (XEXP (x, 1)))
13638 const_addend = XEXP (x, 1);
13639 x = XEXP (x, 0);
13642 if (GET_CODE (x) == UNSPEC
13643 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13644 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13645 result = XVECEXP (x, 0, 0);
13647 if (TARGET_MACHO && darwin_local_data_pic (x)
13648 && !MEM_P (orig_x))
13649 result = XVECEXP (x, 0, 0);
13651 if (! result)
13652 return ix86_delegitimize_tls_address (orig_x);
13654 if (const_addend)
13655 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13656 if (reg_addend)
13657 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13658 if (addend)
13660 /* If the rest of original X doesn't involve the PIC register, add
13661 addend and subtract pic_offset_table_rtx. This can happen e.g.
13662 for code like:
13663 leal (%ebx, %ecx, 4), %ecx
13665 movl foo@GOTOFF(%ecx), %edx
13666 in which case we return (%ecx - %ebx) + foo. */
13667 if (pic_offset_table_rtx)
13668 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13669 pic_offset_table_rtx),
13670 result);
13671 else
13672 return orig_x;
13674 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13676 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13677 if (result == NULL_RTX)
13678 return orig_x;
13680 return result;
13683 /* If X is a machine specific address (i.e. a symbol or label being
13684 referenced as a displacement from the GOT implemented using an
13685 UNSPEC), then return the base term. Otherwise return X. */
13688 ix86_find_base_term (rtx x)
13690 rtx term;
13692 if (TARGET_64BIT)
13694 if (GET_CODE (x) != CONST)
13695 return x;
13696 term = XEXP (x, 0);
13697 if (GET_CODE (term) == PLUS
13698 && (CONST_INT_P (XEXP (term, 1))
13699 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13700 term = XEXP (term, 0);
13701 if (GET_CODE (term) != UNSPEC
13702 || (XINT (term, 1) != UNSPEC_GOTPCREL
13703 && XINT (term, 1) != UNSPEC_PCREL))
13704 return x;
13706 return XVECEXP (term, 0, 0);
13709 return ix86_delegitimize_address (x);
13712 static void
13713 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13714 bool fp, FILE *file)
13716 const char *suffix;
13718 if (mode == CCFPmode || mode == CCFPUmode)
13720 code = ix86_fp_compare_code_to_integer (code);
13721 mode = CCmode;
13723 if (reverse)
13724 code = reverse_condition (code);
13726 switch (code)
13728 case EQ:
13729 switch (mode)
13731 case CCAmode:
13732 suffix = "a";
13733 break;
13735 case CCCmode:
13736 suffix = "c";
13737 break;
13739 case CCOmode:
13740 suffix = "o";
13741 break;
13743 case CCSmode:
13744 suffix = "s";
13745 break;
13747 default:
13748 suffix = "e";
13750 break;
13751 case NE:
13752 switch (mode)
13754 case CCAmode:
13755 suffix = "na";
13756 break;
13758 case CCCmode:
13759 suffix = "nc";
13760 break;
13762 case CCOmode:
13763 suffix = "no";
13764 break;
13766 case CCSmode:
13767 suffix = "ns";
13768 break;
13770 default:
13771 suffix = "ne";
13773 break;
13774 case GT:
13775 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13776 suffix = "g";
13777 break;
13778 case GTU:
13779 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13780 Those same assemblers have the same but opposite lossage on cmov. */
13781 if (mode == CCmode)
13782 suffix = fp ? "nbe" : "a";
13783 else if (mode == CCCmode)
13784 suffix = "b";
13785 else
13786 gcc_unreachable ();
13787 break;
13788 case LT:
13789 switch (mode)
13791 case CCNOmode:
13792 case CCGOCmode:
13793 suffix = "s";
13794 break;
13796 case CCmode:
13797 case CCGCmode:
13798 suffix = "l";
13799 break;
13801 default:
13802 gcc_unreachable ();
13804 break;
13805 case LTU:
13806 gcc_assert (mode == CCmode || mode == CCCmode);
13807 suffix = "b";
13808 break;
13809 case GE:
13810 switch (mode)
13812 case CCNOmode:
13813 case CCGOCmode:
13814 suffix = "ns";
13815 break;
13817 case CCmode:
13818 case CCGCmode:
13819 suffix = "ge";
13820 break;
13822 default:
13823 gcc_unreachable ();
13825 break;
13826 case GEU:
13827 /* ??? As above. */
13828 gcc_assert (mode == CCmode || mode == CCCmode);
13829 suffix = fp ? "nb" : "ae";
13830 break;
13831 case LE:
13832 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13833 suffix = "le";
13834 break;
13835 case LEU:
13836 /* ??? As above. */
13837 if (mode == CCmode)
13838 suffix = "be";
13839 else if (mode == CCCmode)
13840 suffix = fp ? "nb" : "ae";
13841 else
13842 gcc_unreachable ();
13843 break;
13844 case UNORDERED:
13845 suffix = fp ? "u" : "p";
13846 break;
13847 case ORDERED:
13848 suffix = fp ? "nu" : "np";
13849 break;
13850 default:
13851 gcc_unreachable ();
13853 fputs (suffix, file);
13856 /* Print the name of register X to FILE based on its machine mode and number.
13857 If CODE is 'w', pretend the mode is HImode.
13858 If CODE is 'b', pretend the mode is QImode.
13859 If CODE is 'k', pretend the mode is SImode.
13860 If CODE is 'q', pretend the mode is DImode.
13861 If CODE is 'x', pretend the mode is V4SFmode.
13862 If CODE is 't', pretend the mode is V8SFmode.
13863 If CODE is 'h', pretend the reg is the 'high' byte register.
13864 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13865 If CODE is 'd', duplicate the operand for AVX instruction.
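/* For example (illustrative): for the AX register, code 'q' prints "rax",
   'k' prints "eax", 'w' prints "ax", 'b' prints "al" and 'h' prints "ah",
   with a leading '%' in AT&T syntax. */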
13868 void
13869 print_reg (rtx x, int code, FILE *file)
13871 const char *reg;
13872 unsigned int regno;
13873 bool duplicated = code == 'd' && TARGET_AVX;
13875 if (ASSEMBLER_DIALECT == ASM_ATT)
13876 putc ('%', file);
13878 if (x == pc_rtx)
13880 gcc_assert (TARGET_64BIT);
13881 fputs ("rip", file);
13882 return;
13885 regno = true_regnum (x);
13886 gcc_assert (regno != ARG_POINTER_REGNUM
13887 && regno != FRAME_POINTER_REGNUM
13888 && regno != FLAGS_REG
13889 && regno != FPSR_REG
13890 && regno != FPCR_REG);
13892 if (code == 'w' || MMX_REG_P (x))
13893 code = 2;
13894 else if (code == 'b')
13895 code = 1;
13896 else if (code == 'k')
13897 code = 4;
13898 else if (code == 'q')
13899 code = 8;
13900 else if (code == 'y')
13901 code = 3;
13902 else if (code == 'h')
13903 code = 0;
13904 else if (code == 'x')
13905 code = 16;
13906 else if (code == 't')
13907 code = 32;
13908 else
13909 code = GET_MODE_SIZE (GET_MODE (x));
13911 /* Irritatingly, AMD extended registers use a different naming
13912 convention from the normal registers: "r%d[bwd]". */
13913 if (REX_INT_REGNO_P (regno))
13915 gcc_assert (TARGET_64BIT);
13916 putc ('r', file);
13917 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13918 switch (code)
13920 case 0:
13921 error ("extended registers have no high halves");
13922 break;
13923 case 1:
13924 putc ('b', file);
13925 break;
13926 case 2:
13927 putc ('w', file);
13928 break;
13929 case 4:
13930 putc ('d', file);
13931 break;
13932 case 8:
13933 /* no suffix */
13934 break;
13935 default:
13936 error ("unsupported operand size for extended register");
13937 break;
13939 return;
13942 reg = NULL;
13943 switch (code)
13945 case 3:
13946 if (STACK_TOP_P (x))
13948 reg = "st(0)";
13949 break;
13951 /* FALLTHRU */
13952 case 8:
13953 case 4:
13954 case 12:
13955 if (! ANY_FP_REG_P (x))
13956 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13957 /* FALLTHRU */
13958 case 16:
13959 case 2:
13960 normal:
13961 reg = hi_reg_name[regno];
13962 break;
13963 case 1:
13964 if (regno >= ARRAY_SIZE (qi_reg_name))
13965 goto normal;
13966 reg = qi_reg_name[regno];
13967 break;
13968 case 0:
13969 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13970 goto normal;
13971 reg = qi_high_reg_name[regno];
13972 break;
13973 case 32:
13974 if (SSE_REG_P (x))
13976 gcc_assert (!duplicated);
13977 putc ('y', file);
13978 fputs (hi_reg_name[regno] + 1, file);
13979 return;
13981 break;
13982 default:
13983 gcc_unreachable ();
13986 fputs (reg, file);
13987 if (duplicated)
13989 if (ASSEMBLER_DIALECT == ASM_ATT)
13990 fprintf (file, ", %%%s", reg);
13991 else
13992 fprintf (file, ", %s", reg);
13996 /* Locate some local-dynamic symbol still in use by this function
13997 so that we can print its name in some tls_local_dynamic_base
13998 pattern. */
14000 static int
14001 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14003 rtx x = *px;
14005 if (GET_CODE (x) == SYMBOL_REF
14006 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14008 cfun->machine->some_ld_name = XSTR (x, 0);
14009 return 1;
14012 return 0;
14015 static const char *
14016 get_some_local_dynamic_name (void)
14018 rtx insn;
14020 if (cfun->machine->some_ld_name)
14021 return cfun->machine->some_ld_name;
14023 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14024 if (NONDEBUG_INSN_P (insn)
14025 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14026 return cfun->machine->some_ld_name;
14028 return NULL;
14031 /* Meaning of CODE:
14032 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14033 C -- print opcode suffix for set/cmov insn.
14034 c -- like C, but print reversed condition
14035 F,f -- likewise, but for floating-point.
14036 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14037 otherwise nothing
14038 R -- print the prefix for register names.
14039 z -- print the opcode suffix for the size of the current operand.
14040 Z -- likewise, with special suffixes for x87 instructions.
14041 * -- print a star (in certain assembler syntax)
14042 A -- print an absolute memory reference.
14043 E -- print address with DImode register names if TARGET_64BIT.
14044 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14045 s -- print a shift double count, followed by the assembler's argument
14046 delimiter.
14047 b -- print the QImode name of the register for the indicated operand.
14048 %b0 would print %al if operands[0] is reg 0.
14049 w -- likewise, print the HImode name of the register.
14050 k -- likewise, print the SImode name of the register.
14051 q -- likewise, print the DImode name of the register.
14052 x -- likewise, print the V4SFmode name of the register.
14053 t -- likewise, print the V8SFmode name of the register.
14054 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14055 y -- print "st(0)" instead of "st" as a register.
14056 d -- print duplicated register operand for AVX instruction.
14057 D -- print condition for SSE cmp instruction.
14058 P -- if PIC, print an @PLT suffix.
14059 p -- print raw symbol name.
14060 X -- don't print any sort of PIC '@' suffix for a symbol.
14061 & -- print some in-use local-dynamic symbol name.
14062 H -- print a memory address offset by 8; used for sse high-parts
14063 Y -- print condition for XOP pcom* instruction.
14064 + -- print a branch hint as 'cs' or 'ds' prefix
14065 ; -- print a semicolon (after prefixes due to bug in older gas).
14066 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14067 @ -- print a segment register of thread base pointer load
14068 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
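/* These codes are used from the operand strings of insn patterns and from
   inline asm templates; e.g. a template such as "mov%z0\t{%1, %0|%0, %1}"
   (illustrative) picks the b/w/l/q suffix from the mode of operand 0, and
   "%k1" forces the SImode name of the register in operand 1. */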
14071 void
14072 ix86_print_operand (FILE *file, rtx x, int code)
14074 if (code)
14076 switch (code)
14078 case 'A':
14079 switch (ASSEMBLER_DIALECT)
14081 case ASM_ATT:
14082 putc ('*', file);
14083 break;
14085 case ASM_INTEL:
14086 /* Intel syntax. For absolute addresses, registers should not
14087 be surrounded by brackets. */
14088 if (!REG_P (x))
14090 putc ('[', file);
14091 ix86_print_operand (file, x, 0);
14092 putc (']', file);
14093 return;
14095 break;
14097 default:
14098 gcc_unreachable ();
14101 ix86_print_operand (file, x, 0);
14102 return;
14104 case 'E':
14105 /* Wrap address in an UNSPEC to declare special handling. */
14106 if (TARGET_64BIT)
14107 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14109 output_address (x);
14110 return;
14112 case 'L':
14113 if (ASSEMBLER_DIALECT == ASM_ATT)
14114 putc ('l', file);
14115 return;
14117 case 'W':
14118 if (ASSEMBLER_DIALECT == ASM_ATT)
14119 putc ('w', file);
14120 return;
14122 case 'B':
14123 if (ASSEMBLER_DIALECT == ASM_ATT)
14124 putc ('b', file);
14125 return;
14127 case 'Q':
14128 if (ASSEMBLER_DIALECT == ASM_ATT)
14129 putc ('l', file);
14130 return;
14132 case 'S':
14133 if (ASSEMBLER_DIALECT == ASM_ATT)
14134 putc ('s', file);
14135 return;
14137 case 'T':
14138 if (ASSEMBLER_DIALECT == ASM_ATT)
14139 putc ('t', file);
14140 return;
14142 case 'O':
14143 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14144 if (ASSEMBLER_DIALECT != ASM_ATT)
14145 return;
14147 switch (GET_MODE_SIZE (GET_MODE (x)))
14149 case 2:
14150 putc ('w', file);
14151 break;
14153 case 4:
14154 putc ('l', file);
14155 break;
14157 case 8:
14158 putc ('q', file);
14159 break;
14161 default:
14162 output_operand_lossage
14163 ("invalid operand size for operand code 'O'");
14164 return;
14167 putc ('.', file);
14168 #endif
14169 return;
14171 case 'z':
14172 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14174 /* Opcodes don't get size suffixes when using Intel syntax. */
14175 if (ASSEMBLER_DIALECT == ASM_INTEL)
14176 return;
14178 switch (GET_MODE_SIZE (GET_MODE (x)))
14180 case 1:
14181 putc ('b', file);
14182 return;
14184 case 2:
14185 putc ('w', file);
14186 return;
14188 case 4:
14189 putc ('l', file);
14190 return;
14192 case 8:
14193 putc ('q', file);
14194 return;
14196 default:
14197 output_operand_lossage
14198 ("invalid operand size for operand code 'z'");
14199 return;
14203 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14204 warning
14205 (0, "non-integer operand used with operand code 'z'");
14206 /* FALLTHRU */
14208 case 'Z':
14209 /* 387 opcodes don't get size suffixes when using Intel syntax. */
14210 if (ASSEMBLER_DIALECT == ASM_INTEL)
14211 return;
14213 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14215 switch (GET_MODE_SIZE (GET_MODE (x)))
14217 case 2:
14218 #ifdef HAVE_AS_IX86_FILDS
14219 putc ('s', file);
14220 #endif
14221 return;
14223 case 4:
14224 putc ('l', file);
14225 return;
14227 case 8:
14228 #ifdef HAVE_AS_IX86_FILDQ
14229 putc ('q', file);
14230 #else
14231 fputs ("ll", file);
14232 #endif
14233 return;
14235 default:
14236 break;
14239 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14241 /* 387 opcodes don't get size suffixes
14242 if the operands are registers. */
14243 if (STACK_REG_P (x))
14244 return;
14246 switch (GET_MODE_SIZE (GET_MODE (x)))
14248 case 4:
14249 putc ('s', file);
14250 return;
14252 case 8:
14253 putc ('l', file);
14254 return;
14256 case 12:
14257 case 16:
14258 putc ('t', file);
14259 return;
14261 default:
14262 break;
14265 else
14267 output_operand_lossage
14268 ("invalid operand type used with operand code 'Z'");
14269 return;
14272 output_operand_lossage
14273 ("invalid operand size for operand code 'Z'");
14274 return;
14276 case 'd':
14277 case 'b':
14278 case 'w':
14279 case 'k':
14280 case 'q':
14281 case 'h':
14282 case 't':
14283 case 'y':
14284 case 'x':
14285 case 'X':
14286 case 'P':
14287 case 'p':
14288 break;
14290 case 's':
14291 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14293 ix86_print_operand (file, x, 0);
14294 fputs (", ", file);
14296 return;
14298 case 'Y':
14299 switch (GET_CODE (x))
14301 case NE:
14302 fputs ("neq", file);
14303 break;
14304 case EQ:
14305 fputs ("eq", file);
14306 break;
14307 case GE:
14308 case GEU:
14309 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14310 break;
14311 case GT:
14312 case GTU:
14313 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14314 break;
14315 case LE:
14316 case LEU:
14317 fputs ("le", file);
14318 break;
14319 case LT:
14320 case LTU:
14321 fputs ("lt", file);
14322 break;
14323 case UNORDERED:
14324 fputs ("unord", file);
14325 break;
14326 case ORDERED:
14327 fputs ("ord", file);
14328 break;
14329 case UNEQ:
14330 fputs ("ueq", file);
14331 break;
14332 case UNGE:
14333 fputs ("nlt", file);
14334 break;
14335 case UNGT:
14336 fputs ("nle", file);
14337 break;
14338 case UNLE:
14339 fputs ("ule", file);
14340 break;
14341 case UNLT:
14342 fputs ("ult", file);
14343 break;
14344 case LTGT:
14345 fputs ("une", file);
14346 break;
14347 default:
14348 output_operand_lossage ("operand is not a condition code, "
14349 "invalid operand code 'Y'");
14350 return;
14352 return;
14354 case 'D':
14355 /* A little bit of braindamage here. The SSE compare instructions
14356 use completely different names for the comparisons than the
14357 fp conditional moves do. */
14358 switch (GET_CODE (x))
14360 case UNEQ:
14361 if (TARGET_AVX)
14363 fputs ("eq_us", file);
14364 break;
14366 case EQ:
14367 fputs ("eq", file);
14368 break;
14369 case UNLT:
14370 if (TARGET_AVX)
14372 fputs ("nge", file);
14373 break;
14375 case LT:
14376 fputs ("lt", file);
14377 break;
14378 case UNLE:
14379 if (TARGET_AVX)
14381 fputs ("ngt", file);
14382 break;
14384 case LE:
14385 fputs ("le", file);
14386 break;
14387 case UNORDERED:
14388 fputs ("unord", file);
14389 break;
14390 case LTGT:
14391 if (TARGET_AVX)
14393 fputs ("neq_oq", file);
14394 break;
14396 case NE:
14397 fputs ("neq", file);
14398 break;
14399 case GE:
14400 if (TARGET_AVX)
14402 fputs ("ge", file);
14403 break;
14405 case UNGE:
14406 fputs ("nlt", file);
14407 break;
14408 case GT:
14409 if (TARGET_AVX)
14411 fputs ("gt", file);
14412 break;
14414 case UNGT:
14415 fputs ("nle", file);
14416 break;
14417 case ORDERED:
14418 fputs ("ord", file);
14419 break;
14420 default:
14421 output_operand_lossage ("operand is not a condition code, "
14422 "invalid operand code 'D'");
14423 return;
14425 return;
14427 case 'F':
14428 case 'f':
14429 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14430 if (ASSEMBLER_DIALECT == ASM_ATT)
14431 putc ('.', file);
14432 #endif
14434 case 'C':
14435 case 'c':
14436 if (!COMPARISON_P (x))
14438 output_operand_lossage ("operand is not a condition code, "
14439 "invalid operand code '%c'", code);
14440 return;
14442 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14443 code == 'c' || code == 'f',
14444 code == 'F' || code == 'f',
14445 file);
14446 return;
14448 case 'H':
14449 if (!offsettable_memref_p (x))
14451 output_operand_lossage ("operand is not an offsettable memory "
14452 "reference, invalid operand code 'H'");
14453 return;
14455 /* It doesn't actually matter what mode we use here, as we're
14456 only going to use this for printing. */
14457 x = adjust_address_nv (x, DImode, 8);
14458 break;
14460 case 'K':
14461 gcc_assert (CONST_INT_P (x));
14463 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14464 #ifdef HAVE_AS_IX86_HLE
14465 fputs ("xacquire ", file);
14466 #else
14467 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14468 #endif
14469 else if (INTVAL (x) & IX86_HLE_RELEASE)
14470 #ifdef HAVE_AS_IX86_HLE
14471 fputs ("xrelease ", file);
14472 #else
14473 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14474 #endif
14475 /* We do not want to print the value of the operand. */
14476 return;
14478 case '*':
14479 if (ASSEMBLER_DIALECT == ASM_ATT)
14480 putc ('*', file);
14481 return;
14483 case '&':
14485 const char *name = get_some_local_dynamic_name ();
14486 if (name == NULL)
14487 output_operand_lossage ("'%%&' used without any "
14488 "local dynamic TLS references");
14489 else
14490 assemble_name (file, name);
14491 return;
14494 case '+':
14496 rtx x;
14498 if (!optimize
14499 || optimize_function_for_size_p (cfun)
14500 || !TARGET_BRANCH_PREDICTION_HINTS)
14501 return;
14503 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14504 if (x)
14506 int pred_val = INTVAL (XEXP (x, 0));
14508 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14509 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14511 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14512 bool cputaken
14513 = final_forward_branch_p (current_output_insn) == 0;
14515 /* Emit hints only in the case default branch prediction
14516 heuristics would fail. */
14517 if (taken != cputaken)
14519 /* We use 3e (DS) prefix for taken branches and
14520 2e (CS) prefix for not taken branches. */
14521 if (taken)
14522 fputs ("ds ; ", file);
14523 else
14524 fputs ("cs ; ", file);
14528 return;
14531 case ';':
14532 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14533 putc (';', file);
14534 #endif
14535 return;
14537 case '@':
14538 if (ASSEMBLER_DIALECT == ASM_ATT)
14539 putc ('%', file);
14541 /* The kernel uses a different segment register for performance
14542 reasons; this way a system call does not have to trash the
14543 userspace segment register, which would be expensive. */
14544 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14545 fputs ("fs", file);
14546 else
14547 fputs ("gs", file);
14548 return;
14550 case '~':
14551 putc (TARGET_AVX2 ? 'i' : 'f', file);
14552 return;
14554 case '^':
14555 if (TARGET_64BIT && Pmode != word_mode)
14556 fputs ("addr32 ", file);
14557 return;
14559 default:
14560 output_operand_lossage ("invalid operand code '%c'", code);
14564 if (REG_P (x))
14565 print_reg (x, code, file);
14567 else if (MEM_P (x))
14569 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14570 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14571 && GET_MODE (x) != BLKmode)
14573 const char * size;
14574 switch (GET_MODE_SIZE (GET_MODE (x)))
14576 case 1: size = "BYTE"; break;
14577 case 2: size = "WORD"; break;
14578 case 4: size = "DWORD"; break;
14579 case 8: size = "QWORD"; break;
14580 case 12: size = "TBYTE"; break;
14581 case 16:
14582 if (GET_MODE (x) == XFmode)
14583 size = "TBYTE";
14584 else
14585 size = "XMMWORD";
14586 break;
14587 case 32: size = "YMMWORD"; break;
14588 default:
14589 gcc_unreachable ();
14592 /* Check for explicit size override (codes 'b', 'w', 'k',
14593 'q' and 'x') */
14594 if (code == 'b')
14595 size = "BYTE";
14596 else if (code == 'w')
14597 size = "WORD";
14598 else if (code == 'k')
14599 size = "DWORD";
14600 else if (code == 'q')
14601 size = "QWORD";
14602 else if (code == 'x')
14603 size = "XMMWORD";
14605 fputs (size, file);
14606 fputs (" PTR ", file);
14609 x = XEXP (x, 0);
14610 /* Avoid (%rip) for call operands. */
14611 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14612 && !CONST_INT_P (x))
14613 output_addr_const (file, x);
14614 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14615 output_operand_lossage ("invalid constraints for operand");
14616 else
14617 output_address (x);
14620 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14622 REAL_VALUE_TYPE r;
14623 long l;
14625 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14626 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14628 if (ASSEMBLER_DIALECT == ASM_ATT)
14629 putc ('$', file);
14630 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14631 if (code == 'q')
14632 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14633 (unsigned long long) (int) l);
14634 else
14635 fprintf (file, "0x%08x", (unsigned int) l);
14638 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14640 REAL_VALUE_TYPE r;
14641 long l[2];
14643 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14644 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14646 if (ASSEMBLER_DIALECT == ASM_ATT)
14647 putc ('$', file);
14648 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14651 /* These float cases don't actually occur as immediate operands. */
14652 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14654 char dstr[30];
14656 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14657 fputs (dstr, file);
14660 else
14662 /* We have patterns that allow zero sets of memory, for instance.
14663 In 64-bit mode, we should probably support all 8-byte vectors,
14664 since we can in fact encode that into an immediate. */
14665 if (GET_CODE (x) == CONST_VECTOR)
14667 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14668 x = const0_rtx;
14671 if (code != 'P' && code != 'p')
14673 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14675 if (ASSEMBLER_DIALECT == ASM_ATT)
14676 putc ('$', file);
14678 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14679 || GET_CODE (x) == LABEL_REF)
14681 if (ASSEMBLER_DIALECT == ASM_ATT)
14682 putc ('$', file);
14683 else
14684 fputs ("OFFSET FLAT:", file);
14687 if (CONST_INT_P (x))
14688 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14689 else if (flag_pic || MACHOPIC_INDIRECT)
14690 output_pic_addr_const (file, x, code);
14691 else
14692 output_addr_const (file, x);
14696 static bool
14697 ix86_print_operand_punct_valid_p (unsigned char code)
14699 return (code == '@' || code == '*' || code == '+' || code == '&'
14700 || code == ';' || code == '~' || code == '^');
14703 /* Print a memory operand whose address is ADDR. */
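/* The decomposed address is printed either in AT&T form, roughly
   "seg:disp(base,index,scale)", or in Intel form, roughly
   "seg:[base+index*scale+disp]"; e.g. the same address might appear as
   "%fs:16(%rax,%rbx,4)" or as "fs:[rax+rbx*4+16]" (illustrative). */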
14705 static void
14706 ix86_print_operand_address (FILE *file, rtx addr)
14708 struct ix86_address parts;
14709 rtx base, index, disp;
14710 int scale;
14711 int ok;
14712 bool vsib = false;
14713 int code = 0;
14715 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14717 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14718 gcc_assert (parts.index == NULL_RTX);
14719 parts.index = XVECEXP (addr, 0, 1);
14720 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14721 addr = XVECEXP (addr, 0, 0);
14722 vsib = true;
14724 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14726 gcc_assert (TARGET_64BIT);
14727 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14728 code = 'q';
14730 else
14731 ok = ix86_decompose_address (addr, &parts);
14733 gcc_assert (ok);
14735 base = parts.base;
14736 index = parts.index;
14737 disp = parts.disp;
14738 scale = parts.scale;
14740 switch (parts.seg)
14742 case SEG_DEFAULT:
14743 break;
14744 case SEG_FS:
14745 case SEG_GS:
14746 if (ASSEMBLER_DIALECT == ASM_ATT)
14747 putc ('%', file);
14748 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14749 break;
14750 default:
14751 gcc_unreachable ();
14754 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14755 if (TARGET_64BIT && !base && !index)
14757 rtx symbol = disp;
14759 if (GET_CODE (disp) == CONST
14760 && GET_CODE (XEXP (disp, 0)) == PLUS
14761 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14762 symbol = XEXP (XEXP (disp, 0), 0);
14764 if (GET_CODE (symbol) == LABEL_REF
14765 || (GET_CODE (symbol) == SYMBOL_REF
14766 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14767 base = pc_rtx;
14769 if (!base && !index)
14771 /* Displacement only requires special attention. */
14773 if (CONST_INT_P (disp))
14775 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14776 fputs ("ds:", file);
14777 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14779 else if (flag_pic)
14780 output_pic_addr_const (file, disp, 0);
14781 else
14782 output_addr_const (file, disp);
14784 else
14786 /* Print SImode register names to force addr32 prefix. */
14787 if (SImode_address_operand (addr, VOIDmode))
14789 #ifdef ENABLE_CHECKING
14790 gcc_assert (TARGET_64BIT);
14791 switch (GET_CODE (addr))
14793 case SUBREG:
14794 gcc_assert (GET_MODE (addr) == SImode);
14795 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14796 break;
14797 case ZERO_EXTEND:
14798 case AND:
14799 gcc_assert (GET_MODE (addr) == DImode);
14800 break;
14801 default:
14802 gcc_unreachable ();
14804 #endif
14805 gcc_assert (!code);
14806 code = 'k';
14808 else if (code == 0
14809 && TARGET_X32
14810 && disp
14811 && CONST_INT_P (disp)
14812 && INTVAL (disp) < -16*1024*1024)
14814 /* X32 runs in 64-bit mode, where displacement, DISP, in
14815 address DISP(%r64), is encoded as 32-bit immediate sign-
14816 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14817 address is %r64 + 0xffffffffbffffd00. When %r64 <
14818 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14819 which is invalid for x32. The correct address is %r64
14820 - 0x40000300 == 0xf7ffdd64. To properly encode
14821 -0x40000300(%r64) for x32, we zero-extend negative
14822 displacement by forcing addr32 prefix which truncates
14823 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14824 zero-extend all negative displacements, including -1(%rsp).
14825 However, for small negative displacements, sign-extension
14826 won't cause overflow. We only zero-extend negative
14827 displacements if they are < -16*1024*1024, which is also used
14828 to check legitimate address displacements for PIC. */
14829 code = 'k';
14832 if (ASSEMBLER_DIALECT == ASM_ATT)
14834 if (disp)
14836 if (flag_pic)
14837 output_pic_addr_const (file, disp, 0);
14838 else if (GET_CODE (disp) == LABEL_REF)
14839 output_asm_label (disp);
14840 else
14841 output_addr_const (file, disp);
14844 putc ('(', file);
14845 if (base)
14846 print_reg (base, code, file);
14847 if (index)
14849 putc (',', file);
14850 print_reg (index, vsib ? 0 : code, file);
14851 if (scale != 1 || vsib)
14852 fprintf (file, ",%d", scale);
14854 putc (')', file);
14856 else
14858 rtx offset = NULL_RTX;
14860 if (disp)
14862 /* Pull out the offset of a symbol; print any symbol itself. */
14863 if (GET_CODE (disp) == CONST
14864 && GET_CODE (XEXP (disp, 0)) == PLUS
14865 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14867 offset = XEXP (XEXP (disp, 0), 1);
14868 disp = gen_rtx_CONST (VOIDmode,
14869 XEXP (XEXP (disp, 0), 0));
14872 if (flag_pic)
14873 output_pic_addr_const (file, disp, 0);
14874 else if (GET_CODE (disp) == LABEL_REF)
14875 output_asm_label (disp);
14876 else if (CONST_INT_P (disp))
14877 offset = disp;
14878 else
14879 output_addr_const (file, disp);
14882 putc ('[', file);
14883 if (base)
14885 print_reg (base, code, file);
14886 if (offset)
14888 if (INTVAL (offset) >= 0)
14889 putc ('+', file);
14890 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14893 else if (offset)
14894 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14895 else
14896 putc ('0', file);
14898 if (index)
14900 putc ('+', file);
14901 print_reg (index, vsib ? 0 : code, file);
14902 if (scale != 1 || vsib)
14903 fprintf (file, "*%d", scale);
14905 putc (']', file);
14910 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14912 static bool
14913 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14915 rtx op;
14917 if (GET_CODE (x) != UNSPEC)
14918 return false;
14920 op = XVECEXP (x, 0, 0);
14921 switch (XINT (x, 1))
14923 case UNSPEC_GOTTPOFF:
14924 output_addr_const (file, op);
14925 /* FIXME: This might be @TPOFF in Sun ld. */
14926 fputs ("@gottpoff", file);
14927 break;
14928 case UNSPEC_TPOFF:
14929 output_addr_const (file, op);
14930 fputs ("@tpoff", file);
14931 break;
14932 case UNSPEC_NTPOFF:
14933 output_addr_const (file, op);
14934 if (TARGET_64BIT)
14935 fputs ("@tpoff", file);
14936 else
14937 fputs ("@ntpoff", file);
14938 break;
14939 case UNSPEC_DTPOFF:
14940 output_addr_const (file, op);
14941 fputs ("@dtpoff", file);
14942 break;
14943 case UNSPEC_GOTNTPOFF:
14944 output_addr_const (file, op);
14945 if (TARGET_64BIT)
14946 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14947 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14948 else
14949 fputs ("@gotntpoff", file);
14950 break;
14951 case UNSPEC_INDNTPOFF:
14952 output_addr_const (file, op);
14953 fputs ("@indntpoff", file);
14954 break;
14955 #if TARGET_MACHO
14956 case UNSPEC_MACHOPIC_OFFSET:
14957 output_addr_const (file, op);
14958 putc ('-', file);
14959 machopic_output_function_base_name (file);
14960 break;
14961 #endif
14963 case UNSPEC_STACK_CHECK:
14965 int offset;
14967 gcc_assert (flag_split_stack);
14969 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14970 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14971 #else
14972 gcc_unreachable ();
14973 #endif
14975 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14977 break;
14979 default:
14980 return false;
14983 return true;
14986 /* Split one or more double-mode RTL references into pairs of half-mode
14987 references. The RTL can be REG, offsettable MEM, integer constant, or
14988 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14989 split and "num" is its length. lo_half and hi_half are output arrays
14990 that parallel "operands". */
14992 void
14993 split_double_mode (enum machine_mode mode, rtx operands[],
14994 int num, rtx lo_half[], rtx hi_half[])
14996 enum machine_mode half_mode;
14997 unsigned int byte;
14999 switch (mode)
15001 case TImode:
15002 half_mode = DImode;
15003 break;
15004 case DImode:
15005 half_mode = SImode;
15006 break;
15007 default:
15008 gcc_unreachable ();
15011 byte = GET_MODE_SIZE (half_mode);
15013 while (num--)
15015 rtx op = operands[num];
15017 /* simplify_subreg refuses to split volatile memory addresses,
15018 but we still have to handle them. */
15019 if (MEM_P (op))
15021 lo_half[num] = adjust_address (op, half_mode, 0);
15022 hi_half[num] = adjust_address (op, half_mode, byte);
15024 else
15026 lo_half[num] = simplify_gen_subreg (half_mode, op,
15027 GET_MODE (op) == VOIDmode
15028 ? mode : GET_MODE (op), 0);
15029 hi_half[num] = simplify_gen_subreg (half_mode, op,
15030 GET_MODE (op) == VOIDmode
15031 ? mode : GET_MODE (op), byte);
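/* For illustration (operands are hypothetical): on ia32 a DImode memory
   operand such as (mem:DI (reg:SI bx)) is split by the routine above into
   lo_half = (mem:SI (reg:SI bx)) and
   hi_half = (mem:SI (plus (reg:SI bx) (const_int 4))), while register and
   constant operands go through simplify_gen_subreg, e.g. a TImode pseudo
   on x86-64 splits into its two DImode subregs.  */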
15036 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15037 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15038 is the expression of the binary operation. The output may either be
15039 emitted here, or returned to the caller, like all output_* functions.
15041 There is no guarantee that the operands are the same mode, as they
15042 might be within FLOAT or FLOAT_EXTEND expressions. */
15044 #ifndef SYSV386_COMPAT
15045 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15046 wants to fix the assemblers because that causes incompatibility
15047 with gcc. No-one wants to fix gcc because that causes
15048 incompatibility with assemblers... You can use the option of
15049 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15050 #define SYSV386_COMPAT 1
15051 #endif
15053 const char *
15054 output_387_binary_op (rtx insn, rtx *operands)
15056 static char buf[40];
15057 const char *p;
15058 const char *ssep;
15059 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15061 #ifdef ENABLE_CHECKING
15062 /* Even if we do not want to check the inputs, this documents input
15063 constraints, which helps in understanding the following code. */
15064 if (STACK_REG_P (operands[0])
15065 && ((REG_P (operands[1])
15066 && REGNO (operands[0]) == REGNO (operands[1])
15067 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15068 || (REG_P (operands[2])
15069 && REGNO (operands[0]) == REGNO (operands[2])
15070 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15071 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15072 ; /* ok */
15073 else
15074 gcc_assert (is_sse);
15075 #endif
15077 switch (GET_CODE (operands[3]))
15079 case PLUS:
15080 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15081 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15082 p = "fiadd";
15083 else
15084 p = "fadd";
15085 ssep = "vadd";
15086 break;
15088 case MINUS:
15089 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15090 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15091 p = "fisub";
15092 else
15093 p = "fsub";
15094 ssep = "vsub";
15095 break;
15097 case MULT:
15098 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15099 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15100 p = "fimul";
15101 else
15102 p = "fmul";
15103 ssep = "vmul";
15104 break;
15106 case DIV:
15107 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15108 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15109 p = "fidiv";
15110 else
15111 p = "fdiv";
15112 ssep = "vdiv";
15113 break;
15115 default:
15116 gcc_unreachable ();
15119 if (is_sse)
15121 if (TARGET_AVX)
15123 strcpy (buf, ssep);
15124 if (GET_MODE (operands[0]) == SFmode)
15125 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15126 else
15127 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15129 else
15131 strcpy (buf, ssep + 1);
15132 if (GET_MODE (operands[0]) == SFmode)
15133 strcat (buf, "ss\t{%2, %0|%0, %2}");
15134 else
15135 strcat (buf, "sd\t{%2, %0|%0, %2}");
15137 return buf;
15139 strcpy (buf, p);
15141 switch (GET_CODE (operands[3]))
15143 case MULT:
15144 case PLUS:
15145 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15147 rtx temp = operands[2];
15148 operands[2] = operands[1];
15149 operands[1] = temp;
15152 /* We know operands[0] == operands[1]. */
15154 if (MEM_P (operands[2]))
15156 p = "%Z2\t%2";
15157 break;
15160 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15162 if (STACK_TOP_P (operands[0]))
15163 /* How is it that we are storing to a dead operand[2]?
15164 Well, presumably operands[1] is dead too. We can't
15165 store the result to st(0) as st(0) gets popped on this
15166 instruction. Instead store to operands[2] (which I
15167 think has to be st(1)). st(1) will be popped later.
15168 gcc <= 2.8.1 didn't have this check and generated
15169 assembly code that the Unixware assembler rejected. */
15170 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15171 else
15172 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15173 break;
15176 if (STACK_TOP_P (operands[0]))
15177 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15178 else
15179 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15180 break;
15182 case MINUS:
15183 case DIV:
15184 if (MEM_P (operands[1]))
15186 p = "r%Z1\t%1";
15187 break;
15190 if (MEM_P (operands[2]))
15192 p = "%Z2\t%2";
15193 break;
15196 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15198 #if SYSV386_COMPAT
15199 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15200 derived assemblers, confusingly reverse the direction of
15201 the operation for fsub{r} and fdiv{r} when the
15202 destination register is not st(0). The Intel assembler
15203 doesn't have this brain damage. Read !SYSV386_COMPAT to
15204 figure out what the hardware really does. */
15205 if (STACK_TOP_P (operands[0]))
15206 p = "{p\t%0, %2|rp\t%2, %0}";
15207 else
15208 p = "{rp\t%2, %0|p\t%0, %2}";
15209 #else
15210 if (STACK_TOP_P (operands[0]))
15211 /* As above for fmul/fadd, we can't store to st(0). */
15212 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15213 else
15214 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15215 #endif
15216 break;
15219 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15221 #if SYSV386_COMPAT
15222 if (STACK_TOP_P (operands[0]))
15223 p = "{rp\t%0, %1|p\t%1, %0}";
15224 else
15225 p = "{p\t%1, %0|rp\t%0, %1}";
15226 #else
15227 if (STACK_TOP_P (operands[0]))
15228 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15229 else
15230 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15231 #endif
15232 break;
15235 if (STACK_TOP_P (operands[0]))
15237 if (STACK_TOP_P (operands[1]))
15238 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15239 else
15240 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15241 break;
15243 else if (STACK_TOP_P (operands[1]))
15245 #if SYSV386_COMPAT
15246 p = "{\t%1, %0|r\t%0, %1}";
15247 #else
15248 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15249 #endif
15251 else
15253 #if SYSV386_COMPAT
15254 p = "{r\t%2, %0|\t%0, %2}";
15255 #else
15256 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15257 #endif
15259 break;
15261 default:
15262 gcc_unreachable ();
15265 strcat (buf, p);
15266 return buf;
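/* For illustration: with SSE operands the routine above returns the
   two-operand scalar template, e.g. "addss\t{%2, %0|%0, %2}" for an SFmode
   PLUS, while under TARGET_AVX it returns the three-operand VEX form
   "vaddss\t{%2, %1, %0|%0, %1, %2}"; the {att|intel} braces select the
   operand order for the two assembler dialects.  */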
15269 /* Check if a 256bit AVX register is referenced inside of EXP. */
15271 static int
15272 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15274 rtx exp = *pexp;
15276 if (GET_CODE (exp) == SUBREG)
15277 exp = SUBREG_REG (exp);
15279 if (REG_P (exp)
15280 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15281 return 1;
15283 return 0;
15286 /* Return needed mode for entity in optimize_mode_switching pass. */
15288 static int
15289 ix86_avx_u128_mode_needed (rtx insn)
15291 if (CALL_P (insn))
15293 rtx link;
15295 /* Needed mode is set to AVX_U128_CLEAN if there are
15296 no 256bit modes used in function arguments. */
15297 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15298 link;
15299 link = XEXP (link, 1))
15301 if (GET_CODE (XEXP (link, 0)) == USE)
15303 rtx arg = XEXP (XEXP (link, 0), 0);
15305 if (ix86_check_avx256_register (&arg, NULL))
15306 return AVX_U128_ANY;
15310 return AVX_U128_CLEAN;
15313 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
15314 changes state only when a 256bit register is written to, but we need
15315 to prevent the compiler from moving the optimal insertion point above
15316 an eventual read from a 256bit register. */
15317 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15318 return AVX_U128_DIRTY;
15320 return AVX_U128_ANY;
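/* For illustration: any insn that touches a ymm register makes the needed
   mode AVX_U128_DIRTY, while a call whose arguments carry no 256bit values
   needs AVX_U128_CLEAN.  The mode switching pass then places a vzeroupper
   (see ix86_emit_mode_set below) on the DIRTY->CLEAN transition, which is
   what avoids the costly AVX/SSE transition penalty.  */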
15323 /* Return mode that i387 must be switched into
15324 prior to the execution of insn. */
15326 static int
15327 ix86_i387_mode_needed (int entity, rtx insn)
15329 enum attr_i387_cw mode;
15331 /* The mode UNINITIALIZED is used to store the control word after a
15332 function call or ASM pattern. The mode ANY specifies that the function
15333 has no requirements on the control word and makes no changes to the
15334 bits we are interested in. */
15336 if (CALL_P (insn)
15337 || (NONJUMP_INSN_P (insn)
15338 && (asm_noperands (PATTERN (insn)) >= 0
15339 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15340 return I387_CW_UNINITIALIZED;
15342 if (recog_memoized (insn) < 0)
15343 return I387_CW_ANY;
15345 mode = get_attr_i387_cw (insn);
15347 switch (entity)
15349 case I387_TRUNC:
15350 if (mode == I387_CW_TRUNC)
15351 return mode;
15352 break;
15354 case I387_FLOOR:
15355 if (mode == I387_CW_FLOOR)
15356 return mode;
15357 break;
15359 case I387_CEIL:
15360 if (mode == I387_CW_CEIL)
15361 return mode;
15362 break;
15364 case I387_MASK_PM:
15365 if (mode == I387_CW_MASK_PM)
15366 return mode;
15367 break;
15369 default:
15370 gcc_unreachable ();
15373 return I387_CW_ANY;
15376 /* Return mode that entity must be switched into
15377 prior to the execution of insn. */
15380 ix86_mode_needed (int entity, rtx insn)
15382 switch (entity)
15384 case AVX_U128:
15385 return ix86_avx_u128_mode_needed (insn);
15386 case I387_TRUNC:
15387 case I387_FLOOR:
15388 case I387_CEIL:
15389 case I387_MASK_PM:
15390 return ix86_i387_mode_needed (entity, insn);
15391 default:
15392 gcc_unreachable ();
15394 return 0;
15397 /* Check if a 256bit AVX register is referenced in stores. */
15399 static void
15400 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15402 if (ix86_check_avx256_register (&dest, NULL))
15404 bool *used = (bool *) data;
15405 *used = true;
15409 /* Calculate mode of upper 128bit AVX registers after the insn. */
15411 static int
15412 ix86_avx_u128_mode_after (int mode, rtx insn)
15414 rtx pat = PATTERN (insn);
15416 if (vzeroupper_operation (pat, VOIDmode)
15417 || vzeroall_operation (pat, VOIDmode))
15418 return AVX_U128_CLEAN;
15420 /* We know that state is clean after CALL insn if there are no
15421 256bit registers used in the function return register. */
15422 if (CALL_P (insn))
15424 bool avx_reg256_found = false;
15425 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15426 if (!avx_reg256_found)
15427 return AVX_U128_CLEAN;
15430 /* Otherwise, return current mode. Remember that if insn
15431 references AVX 256bit registers, the mode was already changed
15432 to DIRTY from MODE_NEEDED. */
15433 return mode;
15436 /* Return the mode that an insn results in. */
15439 ix86_mode_after (int entity, int mode, rtx insn)
15441 switch (entity)
15443 case AVX_U128:
15444 return ix86_avx_u128_mode_after (mode, insn);
15445 case I387_TRUNC:
15446 case I387_FLOOR:
15447 case I387_CEIL:
15448 case I387_MASK_PM:
15449 return mode;
15450 default:
15451 gcc_unreachable ();
15455 static int
15456 ix86_avx_u128_mode_entry (void)
15458 tree arg;
15460 /* Entry mode is set to AVX_U128_DIRTY if there are
15461 256bit modes used in function arguments. */
15462 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15463 arg = TREE_CHAIN (arg))
15465 rtx incoming = DECL_INCOMING_RTL (arg);
15467 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15468 return AVX_U128_DIRTY;
15471 return AVX_U128_CLEAN;
15474 /* Return a mode that ENTITY is assumed to be
15475 switched to at function entry. */
15478 ix86_mode_entry (int entity)
15480 switch (entity)
15482 case AVX_U128:
15483 return ix86_avx_u128_mode_entry ();
15484 case I387_TRUNC:
15485 case I387_FLOOR:
15486 case I387_CEIL:
15487 case I387_MASK_PM:
15488 return I387_CW_ANY;
15489 default:
15490 gcc_unreachable ();
15494 static int
15495 ix86_avx_u128_mode_exit (void)
15497 rtx reg = crtl->return_rtx;
15499 /* Exit mode is set to AVX_U128_DIRTY if there are
15500 256bit modes used in the function return register. */
15501 if (reg && ix86_check_avx256_register (&reg, NULL))
15502 return AVX_U128_DIRTY;
15504 return AVX_U128_CLEAN;
15507 /* Return a mode that ENTITY is assumed to be
15508 switched to at function exit. */
15511 ix86_mode_exit (int entity)
15513 switch (entity)
15515 case AVX_U128:
15516 return ix86_avx_u128_mode_exit ();
15517 case I387_TRUNC:
15518 case I387_FLOOR:
15519 case I387_CEIL:
15520 case I387_MASK_PM:
15521 return I387_CW_ANY;
15522 default:
15523 gcc_unreachable ();
15527 /* Output code to initialize control word copies used by trunc?f?i and
15528 rounding patterns. CURRENT_MODE is set to the current control word,
15529 while NEW_MODE is set to the new control word. */
15531 static void
15532 emit_i387_cw_initialization (int mode)
15534 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15535 rtx new_mode;
15537 enum ix86_stack_slot slot;
15539 rtx reg = gen_reg_rtx (HImode);
15541 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15542 emit_move_insn (reg, copy_rtx (stored_mode));
15544 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15545 || optimize_function_for_size_p (cfun))
15547 switch (mode)
15549 case I387_CW_TRUNC:
15550 /* round toward zero (truncate) */
15551 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15552 slot = SLOT_CW_TRUNC;
15553 break;
15555 case I387_CW_FLOOR:
15556 /* round down toward -oo */
15557 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15558 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15559 slot = SLOT_CW_FLOOR;
15560 break;
15562 case I387_CW_CEIL:
15563 /* round up toward +oo */
15564 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15565 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15566 slot = SLOT_CW_CEIL;
15567 break;
15569 case I387_CW_MASK_PM:
15570 /* mask precision exception for nearbyint() */
15571 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15572 slot = SLOT_CW_MASK_PM;
15573 break;
15575 default:
15576 gcc_unreachable ();
15579 else
15581 switch (mode)
15583 case I387_CW_TRUNC:
15584 /* round toward zero (truncate) */
15585 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15586 slot = SLOT_CW_TRUNC;
15587 break;
15589 case I387_CW_FLOOR:
15590 /* round down toward -oo */
15591 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15592 slot = SLOT_CW_FLOOR;
15593 break;
15595 case I387_CW_CEIL:
15596 /* round up toward +oo */
15597 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15598 slot = SLOT_CW_CEIL;
15599 break;
15601 case I387_CW_MASK_PM:
15602 /* mask precision exception for nearbyint() */
15603 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15604 slot = SLOT_CW_MASK_PM;
15605 break;
15607 default:
15608 gcc_unreachable ();
15612 gcc_assert (slot < MAX_386_STACK_LOCALS);
15614 new_mode = assign_386_stack_local (HImode, slot);
15615 emit_move_insn (new_mode, reg);
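/* For illustration: bits 11:10 of the x87 control word form the rounding
   control field (00 nearest, 01 down, 10 up, 11 truncate) and bit 5 is the
   precision exception mask, hence the masks 0x0c00, 0x0400, 0x0800 and
   0x0020 used above.  A typical emitted sequence (AT&T syntax; registers
   and stack slots are only an example) is:

       fnstcw  -2(%ebp)        # save the current control word
       movzwl  -2(%ebp), %eax
       orw     $0xc00, %ax     # force round-toward-zero
       movw    %ax, -4(%ebp)   # SLOT_CW_TRUNC copy, later used by fldcw  */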
15618 /* Emit vzeroupper. */
15620 void
15621 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15623 int i;
15625 /* Cancel automatic vzeroupper insertion if there are
15626 live call-saved SSE registers at the insertion point. */
15628 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15629 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15630 return;
15632 if (TARGET_64BIT)
15633 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15634 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15635 return;
15637 emit_insn (gen_avx_vzeroupper ());
15640 /* Generate one or more insns to set ENTITY to MODE. */
15642 void
15643 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15645 switch (entity)
15647 case AVX_U128:
15648 if (mode == AVX_U128_CLEAN)
15649 ix86_avx_emit_vzeroupper (regs_live);
15650 break;
15651 case I387_TRUNC:
15652 case I387_FLOOR:
15653 case I387_CEIL:
15654 case I387_MASK_PM:
15655 if (mode != I387_CW_ANY
15656 && mode != I387_CW_UNINITIALIZED)
15657 emit_i387_cw_initialization (mode);
15658 break;
15659 default:
15660 gcc_unreachable ();
15664 /* Output code for INSN to convert a float to a signed int. OPERANDS
15665 are the insn operands. The output may be [HSD]Imode and the input
15666 operand may be [SDX]Fmode. */
15668 const char *
15669 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15671 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15672 int dimode_p = GET_MODE (operands[0]) == DImode;
15673 int round_mode = get_attr_i387_cw (insn);
15675 /* Jump through a hoop or two for DImode, since the hardware has no
15676 non-popping instruction. We used to do this a different way, but
15677 that was somewhat fragile and broke with post-reload splitters. */
15678 if ((dimode_p || fisttp) && !stack_top_dies)
15679 output_asm_insn ("fld\t%y1", operands);
15681 gcc_assert (STACK_TOP_P (operands[1]));
15682 gcc_assert (MEM_P (operands[0]));
15683 gcc_assert (GET_MODE (operands[1]) != TFmode);
15685 if (fisttp)
15686 output_asm_insn ("fisttp%Z0\t%0", operands);
15687 else
15689 if (round_mode != I387_CW_ANY)
15690 output_asm_insn ("fldcw\t%3", operands);
15691 if (stack_top_dies || dimode_p)
15692 output_asm_insn ("fistp%Z0\t%0", operands);
15693 else
15694 output_asm_insn ("fist%Z0\t%0", operands);
15695 if (round_mode != I387_CW_ANY)
15696 output_asm_insn ("fldcw\t%2", operands);
15699 return "";
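/* For illustration: with SSE3's fisttp the conversion needs no control
   word juggling, e.g. a single "fisttpll (%esp)" for a DImode result.
   Without it the store is bracketed by two control word loads:

       fldcw   <trunc cw slot>   # switch to round-toward-zero
       fistpll <mem>
       fldcw   <saved cw slot>   # restore the previous rounding mode

   the %3 and %2 operands in the templates above refer to those slots.  */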
15702 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15703 have the values zero or one, indicates the ffreep insn's operand
15704 from the OPERANDS array. */
15706 static const char *
15707 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15709 if (TARGET_USE_FFREEP)
15710 #ifdef HAVE_AS_IX86_FFREEP
15711 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15712 #else
15714 static char retval[32];
15715 int regno = REGNO (operands[opno]);
15717 gcc_assert (STACK_REGNO_P (regno));
15719 regno -= FIRST_STACK_REG;
15721 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15722 return retval;
15724 #endif
15726 return opno ? "fstp\t%y1" : "fstp\t%y0";
15730 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15731 should be used. UNORDERED_P is true when fucom should be used. */
15733 const char *
15734 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15736 int stack_top_dies;
15737 rtx cmp_op0, cmp_op1;
15738 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15740 if (eflags_p)
15742 cmp_op0 = operands[0];
15743 cmp_op1 = operands[1];
15745 else
15747 cmp_op0 = operands[1];
15748 cmp_op1 = operands[2];
15751 if (is_sse)
15753 if (GET_MODE (operands[0]) == SFmode)
15754 if (unordered_p)
15755 return "%vucomiss\t{%1, %0|%0, %1}";
15756 else
15757 return "%vcomiss\t{%1, %0|%0, %1}";
15758 else
15759 if (unordered_p)
15760 return "%vucomisd\t{%1, %0|%0, %1}";
15761 else
15762 return "%vcomisd\t{%1, %0|%0, %1}";
15765 gcc_assert (STACK_TOP_P (cmp_op0));
15767 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15769 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15771 if (stack_top_dies)
15773 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15774 return output_387_ffreep (operands, 1);
15776 else
15777 return "ftst\n\tfnstsw\t%0";
15780 if (STACK_REG_P (cmp_op1)
15781 && stack_top_dies
15782 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15783 && REGNO (cmp_op1) != FIRST_STACK_REG)
15785 /* If both the top of the 387 stack and the other operand (also a
15786 stack register) die, then this must be a
15787 `fcompp' float compare. */
15789 if (eflags_p)
15791 /* There is no double popping fcomi variant. Fortunately,
15792 eflags is immune from the fstp's cc clobbering. */
15793 if (unordered_p)
15794 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15795 else
15796 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15797 return output_387_ffreep (operands, 0);
15799 else
15801 if (unordered_p)
15802 return "fucompp\n\tfnstsw\t%0";
15803 else
15804 return "fcompp\n\tfnstsw\t%0";
15807 else
15809 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15811 static const char * const alt[16] =
15813 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15814 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15815 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15816 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15818 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15819 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15820 NULL,
15821 NULL,
15823 "fcomi\t{%y1, %0|%0, %y1}",
15824 "fcomip\t{%y1, %0|%0, %y1}",
15825 "fucomi\t{%y1, %0|%0, %y1}",
15826 "fucomip\t{%y1, %0|%0, %y1}",
15828 NULL,
15829 NULL,
15830 NULL,
15831 NULL
15834 int mask;
15835 const char *ret;
15837 mask = eflags_p << 3;
15838 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15839 mask |= unordered_p << 1;
15840 mask |= stack_top_dies;
15842 gcc_assert (mask < 16);
15843 ret = alt[mask];
15844 gcc_assert (ret);
15846 return ret;
15850 void
15851 ix86_output_addr_vec_elt (FILE *file, int value)
15853 const char *directive = ASM_LONG;
15855 #ifdef ASM_QUAD
15856 if (TARGET_LP64)
15857 directive = ASM_QUAD;
15858 #else
15859 gcc_assert (!TARGET_64BIT);
15860 #endif
15862 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15865 void
15866 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15868 const char *directive = ASM_LONG;
15870 #ifdef ASM_QUAD
15871 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15872 directive = ASM_QUAD;
15873 #else
15874 gcc_assert (!TARGET_64BIT);
15875 #endif
15876 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15877 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15878 fprintf (file, "%s%s%d-%s%d\n",
15879 directive, LPREFIX, value, LPREFIX, rel);
15880 else if (HAVE_AS_GOTOFF_IN_DATA)
15881 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15882 #if TARGET_MACHO
15883 else if (TARGET_MACHO)
15885 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15886 machopic_output_function_base_name (file);
15887 putc ('\n', file);
15889 #endif
15890 else
15891 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15892 GOT_SYMBOL_NAME, LPREFIX, value);
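/* For illustration, assuming the usual ".L" local label prefix: on x86-64
   with a DImode case vector this emits e.g. "\t.quad\t.L5-.L2", while on
   ia32 with an assembler that accepts @GOTOFF in data it emits
   "\t.long\t.L5@GOTOFF" instead.  */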
15895 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15896 for the target. */
15898 void
15899 ix86_expand_clear (rtx dest)
15901 rtx tmp;
15903 /* We play register width games, which are only valid after reload. */
15904 gcc_assert (reload_completed);
15906 /* Avoid HImode and its attendant prefix byte. */
15907 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15908 dest = gen_rtx_REG (SImode, REGNO (dest));
15909 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15911 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15912 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15914 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15915 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15918 emit_insn (tmp);
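/* For illustration: clearing %eax this way normally emits the 2-byte
   "xorl %eax, %eax" (the parallel records the EFLAGS clobber) and only
   falls back to the 5-byte "movl $0, %eax" when TARGET_USE_MOV0 asks for
   it and the insn is not being optimized for speed; QImode and HImode
   destinations are widened to SImode first to avoid the operand-size
   prefix.  */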
15921 /* X is an unchanging MEM. If it is a constant pool reference, return
15922 the constant pool rtx, else NULL. */
15925 maybe_get_pool_constant (rtx x)
15927 x = ix86_delegitimize_address (XEXP (x, 0));
15929 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15930 return get_pool_constant (x);
15932 return NULL_RTX;
15935 void
15936 ix86_expand_move (enum machine_mode mode, rtx operands[])
15938 rtx op0, op1;
15939 enum tls_model model;
15941 op0 = operands[0];
15942 op1 = operands[1];
15944 if (GET_CODE (op1) == SYMBOL_REF)
15946 model = SYMBOL_REF_TLS_MODEL (op1);
15947 if (model)
15949 op1 = legitimize_tls_address (op1, model, true);
15950 op1 = force_operand (op1, op0);
15951 if (op1 == op0)
15952 return;
15953 op1 = convert_to_mode (mode, op1, 1);
15955 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15956 && SYMBOL_REF_DLLIMPORT_P (op1))
15957 op1 = legitimize_dllimport_symbol (op1, false);
15959 else if (GET_CODE (op1) == CONST
15960 && GET_CODE (XEXP (op1, 0)) == PLUS
15961 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15963 rtx addend = XEXP (XEXP (op1, 0), 1);
15964 rtx symbol = XEXP (XEXP (op1, 0), 0);
15965 rtx tmp = NULL;
15967 model = SYMBOL_REF_TLS_MODEL (symbol);
15968 if (model)
15969 tmp = legitimize_tls_address (symbol, model, true);
15970 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15971 && SYMBOL_REF_DLLIMPORT_P (symbol))
15972 tmp = legitimize_dllimport_symbol (symbol, true);
15974 if (tmp)
15976 tmp = force_operand (tmp, NULL);
15977 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15978 op0, 1, OPTAB_DIRECT);
15979 if (tmp == op0)
15980 return;
15981 op1 = convert_to_mode (mode, tmp, 1);
15985 if ((flag_pic || MACHOPIC_INDIRECT)
15986 && symbolic_operand (op1, mode))
15988 if (TARGET_MACHO && !TARGET_64BIT)
15990 #if TARGET_MACHO
15991 /* dynamic-no-pic */
15992 if (MACHOPIC_INDIRECT)
15994 rtx temp = ((reload_in_progress
15995 || ((op0 && REG_P (op0))
15996 && mode == Pmode))
15997 ? op0 : gen_reg_rtx (Pmode));
15998 op1 = machopic_indirect_data_reference (op1, temp);
15999 if (MACHOPIC_PURE)
16000 op1 = machopic_legitimize_pic_address (op1, mode,
16001 temp == op1 ? 0 : temp);
16003 if (op0 != op1 && GET_CODE (op0) != MEM)
16005 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16006 emit_insn (insn);
16007 return;
16009 if (GET_CODE (op0) == MEM)
16010 op1 = force_reg (Pmode, op1);
16011 else
16013 rtx temp = op0;
16014 if (GET_CODE (temp) != REG)
16015 temp = gen_reg_rtx (Pmode);
16016 temp = legitimize_pic_address (op1, temp);
16017 if (temp == op0)
16018 return;
16019 op1 = temp;
16021 /* dynamic-no-pic */
16022 #endif
16024 else
16026 if (MEM_P (op0))
16027 op1 = force_reg (mode, op1);
16028 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16030 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16031 op1 = legitimize_pic_address (op1, reg);
16032 if (op0 == op1)
16033 return;
16034 op1 = convert_to_mode (mode, op1, 1);
16038 else
16040 if (MEM_P (op0)
16041 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16042 || !push_operand (op0, mode))
16043 && MEM_P (op1))
16044 op1 = force_reg (mode, op1);
16046 if (push_operand (op0, mode)
16047 && ! general_no_elim_operand (op1, mode))
16048 op1 = copy_to_mode_reg (mode, op1);
16050 /* Force large constants in 64bit compilation into a register
16051 so that they get CSEed. */
16052 if (can_create_pseudo_p ()
16053 && (mode == DImode) && TARGET_64BIT
16054 && immediate_operand (op1, mode)
16055 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16056 && !register_operand (op0, mode)
16057 && optimize)
16058 op1 = copy_to_mode_reg (mode, op1);
16060 if (can_create_pseudo_p ()
16061 && FLOAT_MODE_P (mode)
16062 && GET_CODE (op1) == CONST_DOUBLE)
16064 /* If we are loading a floating point constant to a register,
16065 force the value to memory now, since we'll get better code
16066 out of the back end. */
16068 op1 = validize_mem (force_const_mem (mode, op1));
16069 if (!register_operand (op0, mode))
16071 rtx temp = gen_reg_rtx (mode);
16072 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16073 emit_move_insn (op0, temp);
16074 return;
16079 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16082 void
16083 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16085 rtx op0 = operands[0], op1 = operands[1];
16086 unsigned int align = GET_MODE_ALIGNMENT (mode);
16088 /* Force constants other than zero into memory. We do not know how
16089 the instructions used to build constants modify the upper 64 bits
16090 of the register; once we have that information we may be able
16091 to handle some of them more efficiently. */
16092 if (can_create_pseudo_p ()
16093 && register_operand (op0, mode)
16094 && (CONSTANT_P (op1)
16095 || (GET_CODE (op1) == SUBREG
16096 && CONSTANT_P (SUBREG_REG (op1))))
16097 && !standard_sse_constant_p (op1))
16098 op1 = validize_mem (force_const_mem (mode, op1));
16100 /* We need to check memory alignment for SSE modes since an attribute
16101 can make operands unaligned. */
16102 if (can_create_pseudo_p ()
16103 && SSE_REG_MODE_P (mode)
16104 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16105 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16107 rtx tmp[2];
16109 /* ix86_expand_vector_move_misalign() does not like constants ... */
16110 if (CONSTANT_P (op1)
16111 || (GET_CODE (op1) == SUBREG
16112 && CONSTANT_P (SUBREG_REG (op1))))
16113 op1 = validize_mem (force_const_mem (mode, op1));
16115 /* ... nor both arguments in memory. */
16116 if (!register_operand (op0, mode)
16117 && !register_operand (op1, mode))
16118 op1 = force_reg (mode, op1);
16120 tmp[0] = op0; tmp[1] = op1;
16121 ix86_expand_vector_move_misalign (mode, tmp);
16122 return;
16125 /* Make operand1 a register if it isn't already. */
16126 if (can_create_pseudo_p ()
16127 && !register_operand (op0, mode)
16128 && !register_operand (op1, mode))
16130 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16131 return;
16134 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16137 /* Split 32-byte AVX unaligned load and store if needed. */
16139 static void
16140 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16142 rtx m;
16143 rtx (*extract) (rtx, rtx, rtx);
16144 rtx (*load_unaligned) (rtx, rtx);
16145 rtx (*store_unaligned) (rtx, rtx);
16146 enum machine_mode mode;
16148 switch (GET_MODE (op0))
16150 default:
16151 gcc_unreachable ();
16152 case V32QImode:
16153 extract = gen_avx_vextractf128v32qi;
16154 load_unaligned = gen_avx_loaddqu256;
16155 store_unaligned = gen_avx_storedqu256;
16156 mode = V16QImode;
16157 break;
16158 case V8SFmode:
16159 extract = gen_avx_vextractf128v8sf;
16160 load_unaligned = gen_avx_loadups256;
16161 store_unaligned = gen_avx_storeups256;
16162 mode = V4SFmode;
16163 break;
16164 case V4DFmode:
16165 extract = gen_avx_vextractf128v4df;
16166 load_unaligned = gen_avx_loadupd256;
16167 store_unaligned = gen_avx_storeupd256;
16168 mode = V2DFmode;
16169 break;
16172 if (MEM_P (op1))
16174 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16176 rtx r = gen_reg_rtx (mode);
16177 m = adjust_address (op1, mode, 0);
16178 emit_move_insn (r, m);
16179 m = adjust_address (op1, mode, 16);
16180 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16181 emit_move_insn (op0, r);
16183 else
16184 emit_insn (load_unaligned (op0, op1));
16186 else if (MEM_P (op0))
16188 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16190 m = adjust_address (op0, mode, 0);
16191 emit_insn (extract (m, op1, const0_rtx));
16192 m = adjust_address (op0, mode, 16);
16193 emit_insn (extract (m, op1, const1_rtx));
16195 else
16196 emit_insn (store_unaligned (op0, op1));
16198 else
16199 gcc_unreachable ();
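/* For illustration: with TARGET_AVX256_SPLIT_UNALIGNED_LOAD an unaligned
   32-byte V8SF load becomes a 16-byte load of the low half followed by a
   vinsertf128 of the high half (via the VEC_CONCAT above); otherwise a
   single 256-bit vmovups is emitted.  Stores are split symmetrically with
   vextractf128 when TARGET_AVX256_SPLIT_UNALIGNED_STORE is set.  */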
16202 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16203 straight to ix86_expand_vector_move. */
16204 /* Code generation for scalar reg-reg moves of single and double precision data:
16205 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16206 movaps reg, reg
16207 else
16208 movss reg, reg
16209 if (x86_sse_partial_reg_dependency == true)
16210 movapd reg, reg
16211 else
16212 movsd reg, reg
16214 Code generation for scalar loads of double precision data:
16215 if (x86_sse_split_regs == true)
16216 movlpd mem, reg (gas syntax)
16217 else
16218 movsd mem, reg
16220 Code generation for unaligned packed loads of single precision data
16221 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16222 if (x86_sse_unaligned_move_optimal)
16223 movups mem, reg
16225 if (x86_sse_partial_reg_dependency == true)
16227 xorps reg, reg
16228 movlps mem, reg
16229 movhps mem+8, reg
16231 else
16233 movlps mem, reg
16234 movhps mem+8, reg
16237 Code generation for unaligned packed loads of double precision data
16238 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16239 if (x86_sse_unaligned_move_optimal)
16240 movupd mem, reg
16242 if (x86_sse_split_regs == true)
16244 movlpd mem, reg
16245 movhpd mem+8, reg
16247 else
16249 movsd mem, reg
16250 movhpd mem+8, reg
16254 void
16255 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16257 rtx op0, op1, m;
16259 op0 = operands[0];
16260 op1 = operands[1];
16262 if (TARGET_AVX
16263 && GET_MODE_SIZE (mode) == 32)
16265 switch (GET_MODE_CLASS (mode))
16267 case MODE_VECTOR_INT:
16268 case MODE_INT:
16269 op0 = gen_lowpart (V32QImode, op0);
16270 op1 = gen_lowpart (V32QImode, op1);
16271 /* FALLTHRU */
16273 case MODE_VECTOR_FLOAT:
16274 ix86_avx256_split_vector_move_misalign (op0, op1);
16275 break;
16277 default:
16278 gcc_unreachable ();
16281 return;
16284 if (MEM_P (op1))
16286 /* ??? If we have typed data, then it would appear that using
16287 movdqu is the only way to get unaligned data loaded with
16288 integer type. */
16289 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16291 op0 = gen_lowpart (V16QImode, op0);
16292 op1 = gen_lowpart (V16QImode, op1);
16293 /* We will eventually emit movups based on insn attributes. */
16294 emit_insn (gen_sse2_loaddqu (op0, op1));
16296 else if (TARGET_SSE2 && mode == V2DFmode)
16298 rtx zero;
16300 if (TARGET_AVX
16301 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16302 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16303 || optimize_function_for_size_p (cfun))
16305 /* We will eventually emit movups based on insn attributes. */
16306 emit_insn (gen_sse2_loadupd (op0, op1));
16307 return;
16310 /* When SSE registers are split into halves, we can avoid
16311 writing to the top half twice. */
16312 if (TARGET_SSE_SPLIT_REGS)
16314 emit_clobber (op0);
16315 zero = op0;
16317 else
16319 /* ??? Not sure about the best option for the Intel chips.
16320 The following would seem to satisfy; the register is
16321 entirely cleared, breaking the dependency chain. We
16322 then store to the upper half, with a dependency depth
16323 of one. A rumor has it that Intel recommends two movsd
16324 followed by an unpacklpd, but this is unconfirmed. And
16325 given that the dependency depth of the unpacklpd would
16326 still be one, I'm not sure why this would be better. */
16327 zero = CONST0_RTX (V2DFmode);
16330 m = adjust_address (op1, DFmode, 0);
16331 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16332 m = adjust_address (op1, DFmode, 8);
16333 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16335 else
16337 if (TARGET_AVX
16338 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16339 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16340 || optimize_function_for_size_p (cfun))
16342 op0 = gen_lowpart (V4SFmode, op0);
16343 op1 = gen_lowpart (V4SFmode, op1);
16344 emit_insn (gen_sse_loadups (op0, op1));
16345 return;
16348 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16349 emit_move_insn (op0, CONST0_RTX (mode));
16350 else
16351 emit_clobber (op0);
16353 if (mode != V4SFmode)
16354 op0 = gen_lowpart (V4SFmode, op0);
16356 m = adjust_address (op1, V2SFmode, 0);
16357 emit_insn (gen_sse_loadlps (op0, op0, m));
16358 m = adjust_address (op1, V2SFmode, 8);
16359 emit_insn (gen_sse_loadhps (op0, op0, m));
16362 else if (MEM_P (op0))
16364 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16366 op0 = gen_lowpart (V16QImode, op0);
16367 op1 = gen_lowpart (V16QImode, op1);
16368 /* We will eventually emit movups based on insn attributes. */
16369 emit_insn (gen_sse2_storedqu (op0, op1));
16371 else if (TARGET_SSE2 && mode == V2DFmode)
16373 if (TARGET_AVX
16374 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16375 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16376 || optimize_function_for_size_p (cfun))
16377 /* We will eventually emit movups based on insn attributes. */
16378 emit_insn (gen_sse2_storeupd (op0, op1));
16379 else
16381 m = adjust_address (op0, DFmode, 0);
16382 emit_insn (gen_sse2_storelpd (m, op1));
16383 m = adjust_address (op0, DFmode, 8);
16384 emit_insn (gen_sse2_storehpd (m, op1));
16387 else
16389 if (mode != V4SFmode)
16390 op1 = gen_lowpart (V4SFmode, op1);
16392 if (TARGET_AVX
16393 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16394 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16395 || optimize_function_for_size_p (cfun))
16397 op0 = gen_lowpart (V4SFmode, op0);
16398 emit_insn (gen_sse_storeups (op0, op1));
16400 else
16402 m = adjust_address (op0, V2SFmode, 0);
16403 emit_insn (gen_sse_storelps (m, op1));
16404 m = adjust_address (op0, V2SFmode, 8);
16405 emit_insn (gen_sse_storehps (m, op1));
16409 else
16410 gcc_unreachable ();
16413 /* Expand a push in MODE. This is some mode for which we do not support
16414 proper push instructions, at least from the registers that we expect
16415 the value to live in. */
16417 void
16418 ix86_expand_push (enum machine_mode mode, rtx x)
16420 rtx tmp;
16422 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16423 GEN_INT (-GET_MODE_SIZE (mode)),
16424 stack_pointer_rtx, 1, OPTAB_DIRECT);
16425 if (tmp != stack_pointer_rtx)
16426 emit_move_insn (stack_pointer_rtx, tmp);
16428 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16430 /* When we push an operand onto the stack, it has to be aligned at least
16431 at the function argument boundary. However, since we don't have
16432 the argument type, we can't determine the actual argument
16433 boundary. */
16434 emit_move_insn (tmp, x);
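/* For illustration: pushing a 16-byte vector mode this way expands to an
   explicit stack adjustment followed by an ordinary store, roughly

       subl    $16, %esp
       movups  %xmm0, (%esp)

   rather than a real push instruction (register names are only an
   example).  */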
16437 /* Helper function of ix86_fixup_binary_operands to canonicalize
16438 operand order. Returns true if the operands should be swapped. */
16440 static bool
16441 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16442 rtx operands[])
16444 rtx dst = operands[0];
16445 rtx src1 = operands[1];
16446 rtx src2 = operands[2];
16448 /* If the operation is not commutative, we can't do anything. */
16449 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16450 return false;
16452 /* Highest priority is that src1 should match dst. */
16453 if (rtx_equal_p (dst, src1))
16454 return false;
16455 if (rtx_equal_p (dst, src2))
16456 return true;
16458 /* Next highest priority is that immediate constants come second. */
16459 if (immediate_operand (src2, mode))
16460 return false;
16461 if (immediate_operand (src1, mode))
16462 return true;
16464 /* Lowest priority is that memory references should come second. */
16465 if (MEM_P (src2))
16466 return false;
16467 if (MEM_P (src1))
16468 return true;
16470 return false;
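/* For illustration: for a commutative PLUS where only operands[2] matches
   the destination, e.g. dst = 5 + dst, the routine above returns true so
   the expander swaps the sources and the result fits the two-address
   "addl $5, %reg" form with the immediate as the second source.  */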
16474 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16475 destination to use for the operation. If different from the true
16476 destination in operands[0], a copy operation will be required. */
16479 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16480 rtx operands[])
16482 rtx dst = operands[0];
16483 rtx src1 = operands[1];
16484 rtx src2 = operands[2];
16486 /* Canonicalize operand order. */
16487 if (ix86_swap_binary_operands_p (code, mode, operands))
16489 rtx temp;
16491 /* It is invalid to swap operands of different modes. */
16492 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16494 temp = src1;
16495 src1 = src2;
16496 src2 = temp;
16499 /* Both source operands cannot be in memory. */
16500 if (MEM_P (src1) && MEM_P (src2))
16502 /* Optimization: Only read from memory once. */
16503 if (rtx_equal_p (src1, src2))
16505 src2 = force_reg (mode, src2);
16506 src1 = src2;
16508 else
16509 src2 = force_reg (mode, src2);
16512 /* If the destination is memory, and we do not have matching source
16513 operands, do things in registers. */
16514 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16515 dst = gen_reg_rtx (mode);
16517 /* Source 1 cannot be a constant. */
16518 if (CONSTANT_P (src1))
16519 src1 = force_reg (mode, src1);
16521 /* Source 1 cannot be a non-matching memory. */
16522 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16523 src1 = force_reg (mode, src1);
16525 /* Improve address combine. */
16526 if (code == PLUS
16527 && GET_MODE_CLASS (mode) == MODE_INT
16528 && MEM_P (src2))
16529 src2 = force_reg (mode, src2);
16531 operands[1] = src1;
16532 operands[2] = src2;
16533 return dst;
16536 /* Similarly, but assume that the destination has already been
16537 set up properly. */
16539 void
16540 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16541 enum machine_mode mode, rtx operands[])
16543 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16544 gcc_assert (dst == operands[0]);
16547 /* Attempt to expand a binary operator. Make the expansion closer to the
16548 actual machine than just general_operand, which would allow 3 separate
16549 memory references (one output, two input) in a single insn. */
16551 void
16552 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16553 rtx operands[])
16555 rtx src1, src2, dst, op, clob;
16557 dst = ix86_fixup_binary_operands (code, mode, operands);
16558 src1 = operands[1];
16559 src2 = operands[2];
16561 /* Emit the instruction. */
16563 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16564 if (reload_in_progress)
16566 /* Reload doesn't know about the flags register, and doesn't know that
16567 it doesn't want to clobber it. We can only do this with PLUS. */
16568 gcc_assert (code == PLUS);
16569 emit_insn (op);
16571 else if (reload_completed
16572 && code == PLUS
16573 && !rtx_equal_p (dst, src1))
16575 /* This is going to be an LEA; avoid splitting it later. */
16576 emit_insn (op);
16578 else
16580 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16581 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16584 /* Fix up the destination if needed. */
16585 if (dst != operands[0])
16586 emit_move_insn (operands[0], dst);
16589 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16590 the given OPERANDS. */
16592 void
16593 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16594 rtx operands[])
16596 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16597 if (GET_CODE (operands[1]) == SUBREG)
16599 op1 = operands[1];
16600 op2 = operands[2];
16602 else if (GET_CODE (operands[2]) == SUBREG)
16604 op1 = operands[2];
16605 op2 = operands[1];
16607 /* Optimize (__m128i) d | (__m128i) e and similar code
16608 when d and e are float vectors into a float vector logical
16609 insn. In C/C++ without using intrinsics there is no other way
16610 to express a vector logical operation on float vectors than
16611 to cast them temporarily to integer vectors. */
16612 if (op1
16613 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16614 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16615 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16616 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16617 && SUBREG_BYTE (op1) == 0
16618 && (GET_CODE (op2) == CONST_VECTOR
16619 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16620 && SUBREG_BYTE (op2) == 0))
16621 && can_create_pseudo_p ())
16623 rtx dst;
16624 switch (GET_MODE (SUBREG_REG (op1)))
16626 case V4SFmode:
16627 case V8SFmode:
16628 case V2DFmode:
16629 case V4DFmode:
16630 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16631 if (GET_CODE (op2) == CONST_VECTOR)
16633 op2 = gen_lowpart (GET_MODE (dst), op2);
16634 op2 = force_reg (GET_MODE (dst), op2);
16636 else
16638 op1 = operands[1];
16639 op2 = SUBREG_REG (operands[2]);
16640 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16641 op2 = force_reg (GET_MODE (dst), op2);
16643 op1 = SUBREG_REG (op1);
16644 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16645 op1 = force_reg (GET_MODE (dst), op1);
16646 emit_insn (gen_rtx_SET (VOIDmode, dst,
16647 gen_rtx_fmt_ee (code, GET_MODE (dst),
16648 op1, op2)));
16649 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16650 return;
16651 default:
16652 break;
16655 if (!nonimmediate_operand (operands[1], mode))
16656 operands[1] = force_reg (mode, operands[1]);
16657 if (!nonimmediate_operand (operands[2], mode))
16658 operands[2] = force_reg (mode, operands[2]);
16659 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16660 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16661 gen_rtx_fmt_ee (code, mode, operands[1],
16662 operands[2])));
16665 /* Return TRUE or FALSE depending on whether the binary operator meets the
16666 appropriate constraints. */
16668 bool
16669 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16670 rtx operands[3])
16672 rtx dst = operands[0];
16673 rtx src1 = operands[1];
16674 rtx src2 = operands[2];
16676 /* Both source operands cannot be in memory. */
16677 if (MEM_P (src1) && MEM_P (src2))
16678 return false;
16680 /* Canonicalize operand order for commutative operators. */
16681 if (ix86_swap_binary_operands_p (code, mode, operands))
16683 rtx temp = src1;
16684 src1 = src2;
16685 src2 = temp;
16688 /* If the destination is memory, we must have a matching source operand. */
16689 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16690 return false;
16692 /* Source 1 cannot be a constant. */
16693 if (CONSTANT_P (src1))
16694 return false;
16696 /* Source 1 cannot be a non-matching memory. */
16697 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16698 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16699 return (code == AND
16700 && (mode == HImode
16701 || mode == SImode
16702 || (TARGET_64BIT && mode == DImode))
16703 && satisfies_constraint_L (src2));
16705 return true;
16708 /* Attempt to expand a unary operator. Make the expansion closer to the
16709 actual machine than just general_operand, which would allow 2 separate
16710 memory references (one output, one input) in a single insn. */
16712 void
16713 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16714 rtx operands[])
16716 int matching_memory;
16717 rtx src, dst, op, clob;
16719 dst = operands[0];
16720 src = operands[1];
16722 /* If the destination is memory, and we do not have matching source
16723 operands, do things in registers. */
16724 matching_memory = 0;
16725 if (MEM_P (dst))
16727 if (rtx_equal_p (dst, src))
16728 matching_memory = 1;
16729 else
16730 dst = gen_reg_rtx (mode);
16733 /* When source operand is memory, destination must match. */
16734 if (MEM_P (src) && !matching_memory)
16735 src = force_reg (mode, src);
16737 /* Emit the instruction. */
16739 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16740 if (reload_in_progress || code == NOT)
16742 /* Reload doesn't know about the flags register, and doesn't know that
16743 it doesn't want to clobber it. */
16744 gcc_assert (code == NOT);
16745 emit_insn (op);
16747 else
16749 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16750 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16753 /* Fix up the destination if needed. */
16754 if (dst != operands[0])
16755 emit_move_insn (operands[0], dst);
16758 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16759 divisor are within the range [0-255]. */
16761 void
16762 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16763 bool signed_p)
16765 rtx end_label, qimode_label;
16766 rtx insn, div, mod;
16767 rtx scratch, tmp0, tmp1, tmp2;
16768 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16769 rtx (*gen_zero_extend) (rtx, rtx);
16770 rtx (*gen_test_ccno_1) (rtx, rtx);
16772 switch (mode)
16774 case SImode:
16775 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16776 gen_test_ccno_1 = gen_testsi_ccno_1;
16777 gen_zero_extend = gen_zero_extendqisi2;
16778 break;
16779 case DImode:
16780 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16781 gen_test_ccno_1 = gen_testdi_ccno_1;
16782 gen_zero_extend = gen_zero_extendqidi2;
16783 break;
16784 default:
16785 gcc_unreachable ();
16788 end_label = gen_label_rtx ();
16789 qimode_label = gen_label_rtx ();
16791 scratch = gen_reg_rtx (mode);
16793 /* Use 8bit unsigned divmod if dividend and divisor are within
16794 the range [0-255]. */
16795 emit_move_insn (scratch, operands[2]);
16796 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16797 scratch, 1, OPTAB_DIRECT);
16798 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16799 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16800 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16801 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16802 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16803 pc_rtx);
16804 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16805 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16806 JUMP_LABEL (insn) = qimode_label;
16808 /* Generate the original signed/unsigned divmod. */
16809 div = gen_divmod4_1 (operands[0], operands[1],
16810 operands[2], operands[3]);
16811 emit_insn (div);
16813 /* Branch to the end. */
16814 emit_jump_insn (gen_jump (end_label));
16815 emit_barrier ();
16817 /* Generate 8bit unsigned divide. */
16818 emit_label (qimode_label);
16819 /* Don't use operands[0] for result of 8bit divide since not all
16820 registers support QImode ZERO_EXTRACT. */
16821 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16822 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16823 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16824 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16826 if (signed_p)
16828 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16829 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16831 else
16833 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16834 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16837 /* Extract remainder from AH. */
16838 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16839 if (REG_P (operands[1]))
16840 insn = emit_move_insn (operands[1], tmp1);
16841 else
16843 /* Need a new scratch register since the old one has result
16844 of 8bit divide. */
16845 scratch = gen_reg_rtx (mode);
16846 emit_move_insn (scratch, tmp1);
16847 insn = emit_move_insn (operands[1], scratch);
16849 set_unique_reg_note (insn, REG_EQUAL, mod);
16851 /* Zero extend quotient from AL. */
16852 tmp1 = gen_lowpart (QImode, tmp0);
16853 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16854 set_unique_reg_note (insn, REG_EQUAL, div);
16856 emit_label (end_label);
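/* For illustration, the code above emits roughly this shape (mnemonics are
   only a sketch of the generated RTL, not literal assembler output):

       or      divisor, dividend -> scratch
       test    $-0x100, scratch       # any bit above bit 7 set?
       je      .Lqimode
       div/idiv ...                   # full 32/64-bit divide
       jmp     .Lend
   .Lqimode:
       divb    ...                    # 8-bit unsigned divide,
                                      # quotient in %al, remainder in %ah
   .Lend:                                                               */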
16859 #define LEA_MAX_STALL (3)
16860 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16862 /* Increase given DISTANCE in half-cycles according to
16863 dependencies between PREV and NEXT instructions.
16864 Add 1 half-cycle if there is no dependency and
16865 go to the next cycle if there is some dependency. */
16867 static unsigned int
16868 increase_distance (rtx prev, rtx next, unsigned int distance)
16870 df_ref *use_rec;
16871 df_ref *def_rec;
16873 if (!prev || !next)
16874 return distance + (distance & 1) + 2;
16876 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16877 return distance + 1;
16879 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16880 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16881 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16882 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16883 return distance + (distance & 1) + 2;
16885 return distance + 1;
16888 /* Function checks if instruction INSN defines register number
16889 REGNO1 or REGNO2. */
16891 static bool
16892 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16893 rtx insn)
16895 df_ref *def_rec;
16897 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16898 if (DF_REF_REG_DEF_P (*def_rec)
16899 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16900 && (regno1 == DF_REF_REGNO (*def_rec)
16901 || regno2 == DF_REF_REGNO (*def_rec)))
16903 return true;
16906 return false;
16909 /* Function checks if instruction INSN uses register number
16910 REGNO as part of an address expression. */
16912 static bool
16913 insn_uses_reg_mem (unsigned int regno, rtx insn)
16915 df_ref *use_rec;
16917 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16918 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16919 return true;
16921 return false;
16924 /* Search backward for a non-agu definition of register number REGNO1
16925 or register number REGNO2 in the basic block, starting from instruction
16926 START up to the head of the basic block or to instruction INSN.
16928 The function sets *FOUND to true if a definition was found
16929 and to false otherwise.
16931 The distance in half-cycles between START and the found instruction (or
16932 the head of the BB) is added to DISTANCE and returned. */
16934 static int
16935 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16936 rtx insn, int distance,
16937 rtx start, bool *found)
16939 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16940 rtx prev = start;
16941 rtx next = NULL;
16943 *found = false;
16945 while (prev
16946 && prev != insn
16947 && distance < LEA_SEARCH_THRESHOLD)
16949 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16951 distance = increase_distance (prev, next, distance);
16952 if (insn_defines_reg (regno1, regno2, prev))
16954 if (recog_memoized (prev) < 0
16955 || get_attr_type (prev) != TYPE_LEA)
16957 *found = true;
16958 return distance;
16962 next = prev;
16964 if (prev == BB_HEAD (bb))
16965 break;
16967 prev = PREV_INSN (prev);
16970 return distance;
16973 /* Search backward for non-agu definition of register number REGNO1
16974 or register number REGNO2 in INSN's basic block until
16975 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16976 2. Reach neighbour BBs boundary, or
16977 3. Reach agu definition.
16978 Returns the distance between the non-agu definition point and INSN.
16979 If no definition point, returns -1. */
16981 static int
16982 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16983 rtx insn)
16985 basic_block bb = BLOCK_FOR_INSN (insn);
16986 int distance = 0;
16987 bool found = false;
16989 if (insn != BB_HEAD (bb))
16990 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16991 distance, PREV_INSN (insn),
16992 &found);
16994 if (!found && distance < LEA_SEARCH_THRESHOLD)
16996 edge e;
16997 edge_iterator ei;
16998 bool simple_loop = false;
17000 FOR_EACH_EDGE (e, ei, bb->preds)
17001 if (e->src == bb)
17003 simple_loop = true;
17004 break;
17007 if (simple_loop)
17008 distance = distance_non_agu_define_in_bb (regno1, regno2,
17009 insn, distance,
17010 BB_END (bb), &found);
17011 else
17013 int shortest_dist = -1;
17014 bool found_in_bb = false;
17016 FOR_EACH_EDGE (e, ei, bb->preds)
17018 int bb_dist
17019 = distance_non_agu_define_in_bb (regno1, regno2,
17020 insn, distance,
17021 BB_END (e->src),
17022 &found_in_bb);
17023 if (found_in_bb)
17025 if (shortest_dist < 0)
17026 shortest_dist = bb_dist;
17027 else if (bb_dist > 0)
17028 shortest_dist = MIN (bb_dist, shortest_dist);
17030 found = true;
17034 distance = shortest_dist;
17038 /* get_attr_type may modify recog data. We want to make sure
17039 that recog data is valid for instruction INSN, on which
17040 distance_non_agu_define is called. INSN is unchanged here. */
17041 extract_insn_cached (insn);
17043 if (!found)
17044 return -1;
17046 return distance >> 1;
17049 /* Return the distance in half-cycles between INSN and the next
17050 insn that uses register number REGNO in a memory address added
17051 to DISTANCE. Return -1 if REGNO is set.
17053 Put true value into *FOUND if register usage was found and
17054 false otherwise.
17055 Put true value into *REDEFINED if register redefinition was
17056 found and false otherwise. */
17058 static int
17059 distance_agu_use_in_bb (unsigned int regno,
17060 rtx insn, int distance, rtx start,
17061 bool *found, bool *redefined)
17063 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17064 rtx next = start;
17065 rtx prev = NULL;
17067 *found = false;
17068 *redefined = false;
17070 while (next
17071 && next != insn
17072 && distance < LEA_SEARCH_THRESHOLD)
17074 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17076 distance = increase_distance(prev, next, distance);
17077 if (insn_uses_reg_mem (regno, next))
17079 /* Return DISTANCE if OP0 is used in memory
17080 address in NEXT. */
17081 *found = true;
17082 return distance;
17085 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17087 /* Return -1 if OP0 is set in NEXT. */
17088 *redefined = true;
17089 return -1;
17092 prev = next;
17095 if (next == BB_END (bb))
17096 break;
17098 next = NEXT_INSN (next);
17101 return distance;
17104 /* Return the distance between INSN and the next insn that uses
17105 register number REGNO0 in a memory address. Return -1 if no such
17106 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17108 static int
17109 distance_agu_use (unsigned int regno0, rtx insn)
17111 basic_block bb = BLOCK_FOR_INSN (insn);
17112 int distance = 0;
17113 bool found = false;
17114 bool redefined = false;
17116 if (insn != BB_END (bb))
17117 distance = distance_agu_use_in_bb (regno0, insn, distance,
17118 NEXT_INSN (insn),
17119 &found, &redefined);
17121 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17123 edge e;
17124 edge_iterator ei;
17125 bool simple_loop = false;
17127 FOR_EACH_EDGE (e, ei, bb->succs)
17128 if (e->dest == bb)
17130 simple_loop = true;
17131 break;
17134 if (simple_loop)
17135 distance = distance_agu_use_in_bb (regno0, insn,
17136 distance, BB_HEAD (bb),
17137 &found, &redefined);
17138 else
17140 int shortest_dist = -1;
17141 bool found_in_bb = false;
17142 bool redefined_in_bb = false;
17144 FOR_EACH_EDGE (e, ei, bb->succs)
17146 int bb_dist
17147 = distance_agu_use_in_bb (regno0, insn,
17148 distance, BB_HEAD (e->dest),
17149 &found_in_bb, &redefined_in_bb);
17150 if (found_in_bb)
17152 if (shortest_dist < 0)
17153 shortest_dist = bb_dist;
17154 else if (bb_dist > 0)
17155 shortest_dist = MIN (bb_dist, shortest_dist);
17157 found = true;
17161 distance = shortest_dist;
17165 if (!found || redefined)
17166 return -1;
17168 return distance >> 1;
17171 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17172 there is a dilemma of choosing LEA or ADD.
17173 Negative value: ADD is more preferred than LEA
17174 Zero: Neutral
17175 Positive value: LEA is more preferred than ADD */
17176 #define IX86_LEA_PRIORITY 0
17178 /* Return true if use of lea INSN has a performance advantage
17179 over a sequence of instructions. The instruction sequence has
17180 SPLIT_COST cycles higher latency than the lea latency. */
17182 static bool
17183 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17184 unsigned int regno2, int split_cost)
17186 int dist_define, dist_use;
17188 dist_define = distance_non_agu_define (regno1, regno2, insn);
17189 dist_use = distance_agu_use (regno0, insn);
17191 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17193 /* If there is no non-AGU operand definition, no AGU
17194 operand usage and the split cost is 0, then both the lea
17195 and non-lea variants have the same priority. Currently
17196 we prefer lea for 64-bit code and non-lea for 32-bit
17197 code. */
17198 if (dist_use < 0 && split_cost == 0)
17199 return TARGET_64BIT || IX86_LEA_PRIORITY;
17200 else
17201 return true;
17204 /* With a longer definition distance, lea is preferable.
17205 Here we adjust the distance to take into account the splitting
17206 cost and lea priority. */
17207 dist_define += split_cost + IX86_LEA_PRIORITY;
17209 /* If there is no use in a memory address then we just check
17210 that the split cost exceeds the AGU stall. */
17211 if (dist_use < 0)
17212 return dist_define > LEA_MAX_STALL;
17214 /* If this insn has both backward non-agu dependence and forward
17215 agu dependence, the one with the shorter distance takes effect. */
17216 return dist_define >= dist_use;
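/* Illustration of the heuristic above (values are hypothetical): if
   the closest non-AGU definition of an input is one cycle behind the
   insn (dist_define = 1) and the result feeds an address two cycles
   ahead (dist_use = 2), then with split_cost = 1 we compare
   1 + 1 + IX86_LEA_PRIORITY = 2 against 2 and keep the lea.  */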
17219 /* Return true if it is legal to clobber flags by INSN and
17220 false otherwise. */
17222 static bool
17223 ix86_ok_to_clobber_flags (rtx insn)
17225 basic_block bb = BLOCK_FOR_INSN (insn);
17226 df_ref *use;
17227 bitmap live;
17229 while (insn)
17231 if (NONDEBUG_INSN_P (insn))
17233 for (use = DF_INSN_USES (insn); *use; use++)
17234 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17235 return false;
17237 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17238 return true;
17241 if (insn == BB_END (bb))
17242 break;
17244 insn = NEXT_INSN (insn);
17247 live = df_get_live_out(bb);
17248 return !REGNO_REG_SET_P (live, FLAGS_REG);
17251 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17252 move and add to avoid AGU stalls. */
17254 bool
17255 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17257 unsigned int regno0, regno1, regno2;
17259 /* Check if we need to optimize. */
17260 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17261 return false;
17263 /* Check it is correct to split here. */
17264 if (!ix86_ok_to_clobber_flags(insn))
17265 return false;
17267 regno0 = true_regnum (operands[0]);
17268 regno1 = true_regnum (operands[1]);
17269 regno2 = true_regnum (operands[2]);
17271 /* We need to split only adds with a non-destructive
17272 destination operand. */
17273 if (regno0 == regno1 || regno0 == regno2)
17274 return false;
17275 else
17276 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17279 /* Return true if we should emit lea instruction instead of mov
17280 instruction. */
17282 bool
17283 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17285 unsigned int regno0, regno1;
17287 /* Check if we need to optimize. */
17288 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17289 return false;
17291 /* Use lea for reg to reg moves only. */
17292 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17293 return false;
17295 regno0 = true_regnum (operands[0]);
17296 regno1 = true_regnum (operands[1]);
17298 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17301 /* Return true if we need to split lea into a sequence of
17302 instructions to avoid AGU stalls. */
17304 bool
17305 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17307 unsigned int regno0, regno1, regno2;
17308 int split_cost;
17309 struct ix86_address parts;
17310 int ok;
17312 /* Check we need to optimize. */
17313 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17314 return false;
17316 /* Check it is correct to split here. */
17317 if (!ix86_ok_to_clobber_flags(insn))
17318 return false;
17320 ok = ix86_decompose_address (operands[1], &parts);
17321 gcc_assert (ok);
17323 /* There should be at least two components in the address. */
17324 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17325 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17326 return false;
17328 /* We should not split into add if a non-legitimate PIC
17329 operand is used as the displacement. */
17330 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17331 return false;
17333 regno0 = true_regnum (operands[0]) ;
17334 regno1 = INVALID_REGNUM;
17335 regno2 = INVALID_REGNUM;
17337 if (parts.base)
17338 regno1 = true_regnum (parts.base);
17339 if (parts.index)
17340 regno2 = true_regnum (parts.index);
17342 split_cost = 0;
17344 /* Compute how many cycles we will add to execution time
17345 if we split the lea into a sequence of instructions. */
17346 if (parts.base || parts.index)
17348 /* Have to use a mov instruction if the non-destructive
17349 destination form is used. */
17350 if (regno1 != regno0 && regno2 != regno0)
17351 split_cost += 1;
17353 /* Have to add index to base if both exist. */
17354 if (parts.base && parts.index)
17355 split_cost += 1;
17357 /* Have to use shift and adds if scale is 2 or greater. */
17358 if (parts.scale > 1)
17360 if (regno0 != regno1)
17361 split_cost += 1;
17362 else if (regno2 == regno0)
17363 split_cost += 4;
17364 else
17365 split_cost += parts.scale;
17368 /* Have to use an add instruction with an immediate if
17369 disp is nonzero. */
17370 if (parts.disp && parts.disp != const0_rtx)
17371 split_cost += 1;
17373 /* Subtract the price of lea. */
17374 split_cost -= 1;
17377 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
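/* A worked example of the cost computed above, for a hypothetical
   lea 4(%ebx,%ecx,2), %eax with %eax distinct from both inputs:
   a mov for the non-destructive destination (+1), an add of the
   second register (+1), a shift for the scale (+1) and an add of
   the displacement (+1), minus the lea itself (-1), giving
   split_cost = 3.  */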
17380 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17381 matches destination. RTX includes clobber of FLAGS_REG. */
17383 static void
17384 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17385 rtx dst, rtx src)
17387 rtx op, clob;
17389 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17390 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17392 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17395 /* Return true if regno1 def is nearest to the insn. */
17397 static bool
17398 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17400 rtx prev = insn;
17401 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17403 if (insn == start)
17404 return false;
17405 while (prev && prev != start)
17407 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17409 prev = PREV_INSN (prev);
17410 continue;
17412 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17413 return true;
17414 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17415 return false;
17416 prev = PREV_INSN (prev);
17419 /* None of the regs is defined in the bb. */
17420 return false;
17423 /* Split lea instructions into a sequence of instructions
17424 which are executed on the ALU to avoid AGU stalls.
17425 It is assumed that it is allowed to clobber the flags register
17426 at the lea position. */
17428 void
17429 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17431 unsigned int regno0, regno1, regno2;
17432 struct ix86_address parts;
17433 rtx target, tmp;
17434 int ok, adds;
17436 ok = ix86_decompose_address (operands[1], &parts);
17437 gcc_assert (ok);
17439 target = gen_lowpart (mode, operands[0]);
17441 regno0 = true_regnum (target);
17442 regno1 = INVALID_REGNUM;
17443 regno2 = INVALID_REGNUM;
17445 if (parts.base)
17447 parts.base = gen_lowpart (mode, parts.base);
17448 regno1 = true_regnum (parts.base);
17451 if (parts.index)
17453 parts.index = gen_lowpart (mode, parts.index);
17454 regno2 = true_regnum (parts.index);
17457 if (parts.disp)
17458 parts.disp = gen_lowpart (mode, parts.disp);
17460 if (parts.scale > 1)
17462 /* Case r1 = r1 + ... */
17463 if (regno1 == regno0)
17465 /* If we have a case r1 = r1 + C * r1 then we
17466 should use multiplication which is very
17467 expensive. Assume the cost model is wrong if we
17468 have such a case here. */
17469 gcc_assert (regno2 != regno0);
17471 for (adds = parts.scale; adds > 0; adds--)
17472 ix86_emit_binop (PLUS, mode, target, parts.index);
17474 else
17476 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17477 if (regno0 != regno2)
17478 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17480 /* Use shift for scaling. */
17481 ix86_emit_binop (ASHIFT, mode, target,
17482 GEN_INT (exact_log2 (parts.scale)));
17484 if (parts.base)
17485 ix86_emit_binop (PLUS, mode, target, parts.base);
17487 if (parts.disp && parts.disp != const0_rtx)
17488 ix86_emit_binop (PLUS, mode, target, parts.disp);
17491 else if (!parts.base && !parts.index)
17493 gcc_assert(parts.disp);
17494 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17496 else
17498 if (!parts.base)
17500 if (regno0 != regno2)
17501 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17503 else if (!parts.index)
17505 if (regno0 != regno1)
17506 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17508 else
17510 if (regno0 == regno1)
17511 tmp = parts.index;
17512 else if (regno0 == regno2)
17513 tmp = parts.base;
17514 else
17516 rtx tmp1;
17518 /* Find better operand for SET instruction, depending
17519 on which definition is farther from the insn. */
17520 if (find_nearest_reg_def (insn, regno1, regno2))
17521 tmp = parts.index, tmp1 = parts.base;
17522 else
17523 tmp = parts.base, tmp1 = parts.index;
17525 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17527 if (parts.disp && parts.disp != const0_rtx)
17528 ix86_emit_binop (PLUS, mode, target, parts.disp);
17530 ix86_emit_binop (PLUS, mode, target, tmp1);
17531 return;
17534 ix86_emit_binop (PLUS, mode, target, tmp);
17537 if (parts.disp && parts.disp != const0_rtx)
17538 ix86_emit_binop (PLUS, mode, target, parts.disp);
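/* For the hypothetical lea 4(%ebx,%ecx,2), %eax discussed above, the
   splitter emits roughly
       mov %ecx, %eax
       sal $1, %eax
       add %ebx, %eax
       add $4, %eax
   so all of the work runs on the ALU rather than the AGU.  */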
17542 /* Return true if it is ok to optimize an ADD operation to a LEA
17543 operation to avoid flag register consumption. For most processors,
17544 ADD is faster than LEA. For processors like ATOM, if the
17545 destination register of the LEA holds an actual address which will
17546 be used soon, LEA is better, otherwise ADD is better. */
17548 bool
17549 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17551 unsigned int regno0 = true_regnum (operands[0]);
17552 unsigned int regno1 = true_regnum (operands[1]);
17553 unsigned int regno2 = true_regnum (operands[2]);
17555 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17556 if (regno0 != regno1 && regno0 != regno2)
17557 return true;
17559 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17560 return false;
17562 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17565 /* Return true if destination reg of SET_BODY is shift count of
17566 USE_BODY. */
17568 static bool
17569 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17571 rtx set_dest;
17572 rtx shift_rtx;
17573 int i;
17575 /* Retrieve destination of SET_BODY. */
17576 switch (GET_CODE (set_body))
17578 case SET:
17579 set_dest = SET_DEST (set_body);
17580 if (!set_dest || !REG_P (set_dest))
17581 return false;
17582 break;
17583 case PARALLEL:
17584 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17585 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17586 use_body))
17587 return true;
17588 default:
17589 return false;
17590 break;
17593 /* Retrieve shift count of USE_BODY. */
17594 switch (GET_CODE (use_body))
17596 case SET:
17597 shift_rtx = XEXP (use_body, 1);
17598 break;
17599 case PARALLEL:
17600 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17601 if (ix86_dep_by_shift_count_body (set_body,
17602 XVECEXP (use_body, 0, i)))
17603 return true;
17604 default:
17605 return false;
17606 break;
17609 if (shift_rtx
17610 && (GET_CODE (shift_rtx) == ASHIFT
17611 || GET_CODE (shift_rtx) == LSHIFTRT
17612 || GET_CODE (shift_rtx) == ASHIFTRT
17613 || GET_CODE (shift_rtx) == ROTATE
17614 || GET_CODE (shift_rtx) == ROTATERT))
17616 rtx shift_count = XEXP (shift_rtx, 1);
17618 /* Return true if shift count is dest of SET_BODY. */
17619 if (REG_P (shift_count))
17621 /* Add check since it can be invoked before register
17622 allocation in pre-reload schedule. */
17623 if (reload_completed
17624 && true_regnum (set_dest) == true_regnum (shift_count))
17625 return true;
17626 else if (REGNO(set_dest) == REGNO(shift_count))
17627 return true;
17631 return false;
17634 /* Return true if destination reg of SET_INSN is shift count of
17635 USE_INSN. */
17637 bool
17638 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17640 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17641 PATTERN (use_insn));
17644 /* Return TRUE or FALSE depending on whether the unary operator meets the
17645 appropriate constraints. */
17647 bool
17648 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17649 enum machine_mode mode ATTRIBUTE_UNUSED,
17650 rtx operands[2] ATTRIBUTE_UNUSED)
17652 /* If one of operands is memory, source and destination must match. */
17653 if ((MEM_P (operands[0])
17654 || MEM_P (operands[1]))
17655 && ! rtx_equal_p (operands[0], operands[1]))
17656 return false;
17657 return true;
17660 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17661 are ok, keeping in mind the possible movddup alternative. */
17663 bool
17664 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17666 if (MEM_P (operands[0]))
17667 return rtx_equal_p (operands[0], operands[1 + high]);
17668 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17669 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17670 return true;
17673 /* Post-reload splitter for converting an SF or DFmode value in an
17674 SSE register into an unsigned SImode. */
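/* Sketch of the method below with an illustrative input: for the
   value 3000000000.0 (>= 2^31) the LE compare produces an all-ones
   mask, so 2^31 is subtracted giving 852516352.0; the truncating
   convert yields 852516352, and xoring with the mask shifted into
   bit 31 restores 0xb2d05e00 = 3000000000.  Inputs below 2^31 are
   converted unchanged.  */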
17676 void
17677 ix86_split_convert_uns_si_sse (rtx operands[])
17679 enum machine_mode vecmode;
17680 rtx value, large, zero_or_two31, input, two31, x;
17682 large = operands[1];
17683 zero_or_two31 = operands[2];
17684 input = operands[3];
17685 two31 = operands[4];
17686 vecmode = GET_MODE (large);
17687 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17689 /* Load up the value into the low element. We must ensure that the other
17690 elements are valid floats -- zero is the easiest such value. */
17691 if (MEM_P (input))
17693 if (vecmode == V4SFmode)
17694 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17695 else
17696 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17698 else
17700 input = gen_rtx_REG (vecmode, REGNO (input));
17701 emit_move_insn (value, CONST0_RTX (vecmode));
17702 if (vecmode == V4SFmode)
17703 emit_insn (gen_sse_movss (value, value, input));
17704 else
17705 emit_insn (gen_sse2_movsd (value, value, input));
17708 emit_move_insn (large, two31);
17709 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17711 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17712 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17714 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17715 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17717 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17718 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17720 large = gen_rtx_REG (V4SImode, REGNO (large));
17721 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17723 x = gen_rtx_REG (V4SImode, REGNO (value));
17724 if (vecmode == V4SFmode)
17725 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17726 else
17727 emit_insn (gen_sse2_cvttpd2dq (x, value));
17728 value = x;
17730 emit_insn (gen_xorv4si3 (value, value, large));
17733 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17734 Expects the 64-bit DImode to be supplied in a pair of integral
17735 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17736 -mfpmath=sse, !optimize_size only. */
17738 void
17739 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17741 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17742 rtx int_xmm, fp_xmm;
17743 rtx biases, exponents;
17744 rtx x;
17746 int_xmm = gen_reg_rtx (V4SImode);
17747 if (TARGET_INTER_UNIT_MOVES)
17748 emit_insn (gen_movdi_to_sse (int_xmm, input));
17749 else if (TARGET_SSE_SPLIT_REGS)
17751 emit_clobber (int_xmm);
17752 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17754 else
17756 x = gen_reg_rtx (V2DImode);
17757 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17758 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17761 x = gen_rtx_CONST_VECTOR (V4SImode,
17762 gen_rtvec (4, GEN_INT (0x43300000UL),
17763 GEN_INT (0x45300000UL),
17764 const0_rtx, const0_rtx));
17765 exponents = validize_mem (force_const_mem (V4SImode, x));
17767 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17768 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17770 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17771 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17772 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17773 (0x1.0p84 + double(fp_value_hi_xmm)).
17774 Note these exponents differ by 32. */
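/* For instance (illustrative input), 0x0000000300000002 produces the
   doubles 2^52 + 2 and 2^84 + 3*2^32; after the bias subtraction
   below the two lanes hold 2.0 and 12884901888.0, which sum to
   12884901890.0 == 0x300000002.  */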
17776 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17778 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17779 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17780 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17781 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17782 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17783 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17784 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17785 biases = validize_mem (force_const_mem (V2DFmode, biases));
17786 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17788 /* Add the upper and lower DFmode values together. */
17789 if (TARGET_SSE3)
17790 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17791 else
17793 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17794 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17795 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17798 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17801 /* Not used, but eases macroization of patterns. */
17802 void
17803 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17804 rtx input ATTRIBUTE_UNUSED)
17806 gcc_unreachable ();
17809 /* Convert an unsigned SImode value into a DFmode. Only currently used
17810 for SSE, but applicable anywhere. */
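/* The idea below: adding -2^31 in SImode flips the value into the
   signed range, the signed int-to-double conversion is exact, and
   adding 2^31.0 back gives the unsigned value.  E.g. (illustrative)
   0xffffffff wraps to 0x7fffffff, converts to 2147483647.0, and the
   final add yields 4294967295.0.  */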
17812 void
17813 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17815 REAL_VALUE_TYPE TWO31r;
17816 rtx x, fp;
17818 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17819 NULL, 1, OPTAB_DIRECT);
17821 fp = gen_reg_rtx (DFmode);
17822 emit_insn (gen_floatsidf2 (fp, x));
17824 real_ldexp (&TWO31r, &dconst1, 31);
17825 x = const_double_from_real_value (TWO31r, DFmode);
17827 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17828 if (x != target)
17829 emit_move_insn (target, x);
17832 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17833 32-bit mode; otherwise we have a direct convert instruction. */
17835 void
17836 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17838 REAL_VALUE_TYPE TWO32r;
17839 rtx fp_lo, fp_hi, x;
17841 fp_lo = gen_reg_rtx (DFmode);
17842 fp_hi = gen_reg_rtx (DFmode);
17844 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17846 real_ldexp (&TWO32r, &dconst1, 32);
17847 x = const_double_from_real_value (TWO32r, DFmode);
17848 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17850 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17852 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17853 0, OPTAB_DIRECT);
17854 if (x != target)
17855 emit_move_insn (target, x);
17858 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17859 For x86_32, -mfpmath=sse, !optimize_size only. */
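/* Sketch of the method below: split the input into 16-bit halves,
   convert each half exactly to SFmode, and recombine as
   hi * 2^16 + lo.  E.g. (illustrative) 0x12345678 gives
   4660.0 * 65536.0 + 22136.0 = 305419896.0; only the final add can
   round.  */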
17860 void
17861 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17863 REAL_VALUE_TYPE ONE16r;
17864 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17866 real_ldexp (&ONE16r, &dconst1, 16);
17867 x = const_double_from_real_value (ONE16r, SFmode);
17868 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17869 NULL, 0, OPTAB_DIRECT);
17870 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17871 NULL, 0, OPTAB_DIRECT);
17872 fp_hi = gen_reg_rtx (SFmode);
17873 fp_lo = gen_reg_rtx (SFmode);
17874 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17875 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17876 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17877 0, OPTAB_DIRECT);
17878 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17879 0, OPTAB_DIRECT);
17880 if (!rtx_equal_p (target, fp_hi))
17881 emit_move_insn (target, fp_hi);
17884 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17885 a vector of unsigned ints VAL to vector of floats TARGET. */
17887 void
17888 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17890 rtx tmp[8];
17891 REAL_VALUE_TYPE TWO16r;
17892 enum machine_mode intmode = GET_MODE (val);
17893 enum machine_mode fltmode = GET_MODE (target);
17894 rtx (*cvt) (rtx, rtx);
17896 if (intmode == V4SImode)
17897 cvt = gen_floatv4siv4sf2;
17898 else
17899 cvt = gen_floatv8siv8sf2;
17900 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17901 tmp[0] = force_reg (intmode, tmp[0]);
17902 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17903 OPTAB_DIRECT);
17904 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17905 NULL_RTX, 1, OPTAB_DIRECT);
17906 tmp[3] = gen_reg_rtx (fltmode);
17907 emit_insn (cvt (tmp[3], tmp[1]));
17908 tmp[4] = gen_reg_rtx (fltmode);
17909 emit_insn (cvt (tmp[4], tmp[2]));
17910 real_ldexp (&TWO16r, &dconst1, 16);
17911 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17912 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17913 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17914 OPTAB_DIRECT);
17915 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17916 OPTAB_DIRECT);
17917 if (tmp[7] != target)
17918 emit_move_insn (target, tmp[7]);
17921 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17922 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17923 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17924 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17926 rtx
17927 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17929 REAL_VALUE_TYPE TWO31r;
17930 rtx two31r, tmp[4];
17931 enum machine_mode mode = GET_MODE (val);
17932 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17933 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17934 rtx (*cmp) (rtx, rtx, rtx, rtx);
17935 int i;
17937 for (i = 0; i < 3; i++)
17938 tmp[i] = gen_reg_rtx (mode);
17939 real_ldexp (&TWO31r, &dconst1, 31);
17940 two31r = const_double_from_real_value (TWO31r, scalarmode);
17941 two31r = ix86_build_const_vector (mode, 1, two31r);
17942 two31r = force_reg (mode, two31r);
17943 switch (mode)
17945 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17946 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17947 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17948 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17949 default: gcc_unreachable ();
17951 tmp[3] = gen_rtx_LE (mode, two31r, val);
17952 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17953 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17954 0, OPTAB_DIRECT);
17955 if (intmode == V4SImode || TARGET_AVX2)
17956 *xorp = expand_simple_binop (intmode, ASHIFT,
17957 gen_lowpart (intmode, tmp[0]),
17958 GEN_INT (31), NULL_RTX, 0,
17959 OPTAB_DIRECT);
17960 else
17962 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17963 two31 = ix86_build_const_vector (intmode, 1, two31);
17964 *xorp = expand_simple_binop (intmode, AND,
17965 gen_lowpart (intmode, tmp[0]),
17966 two31, NULL_RTX, 0,
17967 OPTAB_DIRECT);
17969 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17970 0, OPTAB_DIRECT);
17973 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17974 then replicate the value for all elements of the vector
17975 register. */
17977 rtx
17978 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17980 int i, n_elt;
17981 rtvec v;
17982 enum machine_mode scalar_mode;
17984 switch (mode)
17986 case V32QImode:
17987 case V16QImode:
17988 case V16HImode:
17989 case V8HImode:
17990 case V8SImode:
17991 case V4SImode:
17992 case V4DImode:
17993 case V2DImode:
17994 gcc_assert (vect);
17995 case V8SFmode:
17996 case V4SFmode:
17997 case V4DFmode:
17998 case V2DFmode:
17999 n_elt = GET_MODE_NUNITS (mode);
18000 v = rtvec_alloc (n_elt);
18001 scalar_mode = GET_MODE_INNER (mode);
18003 RTVEC_ELT (v, 0) = value;
18005 for (i = 1; i < n_elt; ++i)
18006 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18008 return gen_rtx_CONST_VECTOR (mode, v);
18010 default:
18011 gcc_unreachable ();
18015 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18016 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18017 for an SSE register. If VECT is true, then replicate the mask for
18018 all elements of the vector register. If INVERT is true, then create
18019 a mask excluding the sign bit. */
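/* For example, for V4SFmode this builds a vector whose elements are
   0x80000000 (just the sign bit), or 0x7fffffff when INVERT is true;
   the DFmode element is likewise 0x8000000000000000.  */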
18021 rtx
18022 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18024 enum machine_mode vec_mode, imode;
18025 HOST_WIDE_INT hi, lo;
18026 int shift = 63;
18027 rtx v;
18028 rtx mask;
18030 /* Find the sign bit, sign extended to 2*HWI. */
18031 switch (mode)
18033 case V8SImode:
18034 case V4SImode:
18035 case V8SFmode:
18036 case V4SFmode:
18037 vec_mode = mode;
18038 mode = GET_MODE_INNER (mode);
18039 imode = SImode;
18040 lo = 0x80000000, hi = lo < 0;
18041 break;
18043 case V4DImode:
18044 case V2DImode:
18045 case V4DFmode:
18046 case V2DFmode:
18047 vec_mode = mode;
18048 mode = GET_MODE_INNER (mode);
18049 imode = DImode;
18050 if (HOST_BITS_PER_WIDE_INT >= 64)
18051 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18052 else
18053 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18054 break;
18056 case TImode:
18057 case TFmode:
18058 vec_mode = VOIDmode;
18059 if (HOST_BITS_PER_WIDE_INT >= 64)
18061 imode = TImode;
18062 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18064 else
18066 rtvec vec;
18068 imode = DImode;
18069 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18071 if (invert)
18073 lo = ~lo, hi = ~hi;
18074 v = constm1_rtx;
18076 else
18077 v = const0_rtx;
18079 mask = immed_double_const (lo, hi, imode);
18081 vec = gen_rtvec (2, v, mask);
18082 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18083 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18085 return v;
18087 break;
18089 default:
18090 gcc_unreachable ();
18093 if (invert)
18094 lo = ~lo, hi = ~hi;
18096 /* Force this value into the low part of a fp vector constant. */
18097 mask = immed_double_const (lo, hi, imode);
18098 mask = gen_lowpart (mode, mask);
18100 if (vec_mode == VOIDmode)
18101 return force_reg (mode, mask);
18103 v = ix86_build_const_vector (vec_mode, vect, mask);
18104 return force_reg (vec_mode, v);
18107 /* Generate code for floating point ABS or NEG. */
18109 void
18110 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18111 rtx operands[])
18113 rtx mask, set, dst, src;
18114 bool use_sse = false;
18115 bool vector_mode = VECTOR_MODE_P (mode);
18116 enum machine_mode vmode = mode;
18118 if (vector_mode)
18119 use_sse = true;
18120 else if (mode == TFmode)
18121 use_sse = true;
18122 else if (TARGET_SSE_MATH)
18124 use_sse = SSE_FLOAT_MODE_P (mode);
18125 if (mode == SFmode)
18126 vmode = V4SFmode;
18127 else if (mode == DFmode)
18128 vmode = V2DFmode;
18131 /* NEG and ABS performed with SSE use bitwise mask operations.
18132 Create the appropriate mask now. */
18133 if (use_sse)
18134 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18135 else
18136 mask = NULL_RTX;
18138 dst = operands[0];
18139 src = operands[1];
18141 set = gen_rtx_fmt_e (code, mode, src);
18142 set = gen_rtx_SET (VOIDmode, dst, set);
18144 if (mask)
18146 rtx use, clob;
18147 rtvec par;
18149 use = gen_rtx_USE (VOIDmode, mask);
18150 if (vector_mode)
18151 par = gen_rtvec (2, set, use);
18152 else
18154 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18155 par = gen_rtvec (3, set, use, clob);
18157 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18159 else
18160 emit_insn (set);
18163 /* Expand a copysign operation. Special case operand 0 being a constant. */
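/* Both cases below reduce to the bit-level identity
   copysign (x, y) = (x & ~signmask) | (y & signmask),
   using the masks built by ix86_build_signbit_mask; see the split
   routines further down.  */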
18165 void
18166 ix86_expand_copysign (rtx operands[])
18168 enum machine_mode mode, vmode;
18169 rtx dest, op0, op1, mask, nmask;
18171 dest = operands[0];
18172 op0 = operands[1];
18173 op1 = operands[2];
18175 mode = GET_MODE (dest);
18177 if (mode == SFmode)
18178 vmode = V4SFmode;
18179 else if (mode == DFmode)
18180 vmode = V2DFmode;
18181 else
18182 vmode = mode;
18184 if (GET_CODE (op0) == CONST_DOUBLE)
18186 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18188 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18189 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18191 if (mode == SFmode || mode == DFmode)
18193 if (op0 == CONST0_RTX (mode))
18194 op0 = CONST0_RTX (vmode);
18195 else
18197 rtx v = ix86_build_const_vector (vmode, false, op0);
18199 op0 = force_reg (vmode, v);
18202 else if (op0 != CONST0_RTX (mode))
18203 op0 = force_reg (mode, op0);
18205 mask = ix86_build_signbit_mask (vmode, 0, 0);
18207 if (mode == SFmode)
18208 copysign_insn = gen_copysignsf3_const;
18209 else if (mode == DFmode)
18210 copysign_insn = gen_copysigndf3_const;
18211 else
18212 copysign_insn = gen_copysigntf3_const;
18214 emit_insn (copysign_insn (dest, op0, op1, mask));
18216 else
18218 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18220 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18221 mask = ix86_build_signbit_mask (vmode, 0, 0);
18223 if (mode == SFmode)
18224 copysign_insn = gen_copysignsf3_var;
18225 else if (mode == DFmode)
18226 copysign_insn = gen_copysigndf3_var;
18227 else
18228 copysign_insn = gen_copysigntf3_var;
18230 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18234 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18235 be a constant, and so has already been expanded into a vector constant. */
18237 void
18238 ix86_split_copysign_const (rtx operands[])
18240 enum machine_mode mode, vmode;
18241 rtx dest, op0, mask, x;
18243 dest = operands[0];
18244 op0 = operands[1];
18245 mask = operands[3];
18247 mode = GET_MODE (dest);
18248 vmode = GET_MODE (mask);
18250 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18251 x = gen_rtx_AND (vmode, dest, mask);
18252 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18254 if (op0 != CONST0_RTX (vmode))
18256 x = gen_rtx_IOR (vmode, dest, op0);
18257 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18261 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18262 so we have to do two masks. */
18264 void
18265 ix86_split_copysign_var (rtx operands[])
18267 enum machine_mode mode, vmode;
18268 rtx dest, scratch, op0, op1, mask, nmask, x;
18270 dest = operands[0];
18271 scratch = operands[1];
18272 op0 = operands[2];
18273 op1 = operands[3];
18274 nmask = operands[4];
18275 mask = operands[5];
18277 mode = GET_MODE (dest);
18278 vmode = GET_MODE (mask);
18280 if (rtx_equal_p (op0, op1))
18282 /* Shouldn't happen often (it's useless, obviously), but when it does
18283 we'd generate incorrect code if we continue below. */
18284 emit_move_insn (dest, op0);
18285 return;
18288 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18290 gcc_assert (REGNO (op1) == REGNO (scratch));
18292 x = gen_rtx_AND (vmode, scratch, mask);
18293 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18295 dest = mask;
18296 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18297 x = gen_rtx_NOT (vmode, dest);
18298 x = gen_rtx_AND (vmode, x, op0);
18299 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18301 else
18303 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18305 x = gen_rtx_AND (vmode, scratch, mask);
18307 else /* alternative 2,4 */
18309 gcc_assert (REGNO (mask) == REGNO (scratch));
18310 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18311 x = gen_rtx_AND (vmode, scratch, op1);
18313 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18315 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18317 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18318 x = gen_rtx_AND (vmode, dest, nmask);
18320 else /* alternative 3,4 */
18322 gcc_assert (REGNO (nmask) == REGNO (dest));
18323 dest = nmask;
18324 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18325 x = gen_rtx_AND (vmode, dest, op0);
18327 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18330 x = gen_rtx_IOR (vmode, dest, scratch);
18331 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18334 /* Return TRUE or FALSE depending on whether the first SET in INSN
18335 has source and destination with matching CC modes, and whether the
18336 CC mode is at least as constrained as REQ_MODE. */
18338 bool
18339 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18341 rtx set;
18342 enum machine_mode set_mode;
18344 set = PATTERN (insn);
18345 if (GET_CODE (set) == PARALLEL)
18346 set = XVECEXP (set, 0, 0);
18347 gcc_assert (GET_CODE (set) == SET);
18348 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18350 set_mode = GET_MODE (SET_DEST (set));
18351 switch (set_mode)
18353 case CCNOmode:
18354 if (req_mode != CCNOmode
18355 && (req_mode != CCmode
18356 || XEXP (SET_SRC (set), 1) != const0_rtx))
18357 return false;
18358 break;
18359 case CCmode:
18360 if (req_mode == CCGCmode)
18361 return false;
18362 /* FALLTHRU */
18363 case CCGCmode:
18364 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18365 return false;
18366 /* FALLTHRU */
18367 case CCGOCmode:
18368 if (req_mode == CCZmode)
18369 return false;
18370 /* FALLTHRU */
18371 case CCZmode:
18372 break;
18374 case CCAmode:
18375 case CCCmode:
18376 case CCOmode:
18377 case CCSmode:
18378 if (set_mode != req_mode)
18379 return false;
18380 break;
18382 default:
18383 gcc_unreachable ();
18386 return GET_MODE (SET_SRC (set)) == set_mode;
18389 /* Generate insn patterns to do an integer compare of OPERANDS. */
18391 static rtx
18392 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18394 enum machine_mode cmpmode;
18395 rtx tmp, flags;
18397 cmpmode = SELECT_CC_MODE (code, op0, op1);
18398 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18400 /* This is very simple, but making the interface the same as in the
18401 FP case makes the rest of the code easier. */
18402 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18403 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18405 /* Return the test that should be put into the flags user, i.e.
18406 the bcc, scc, or cmov instruction. */
18407 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18410 /* Figure out whether to use ordered or unordered fp comparisons.
18411 Return the appropriate mode to use. */
18413 enum machine_mode
18414 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18416 /* ??? In order to make all comparisons reversible, we do all comparisons
18417 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18418 all forms of trapping and nontrapping comparisons, we can make inequality
18419 comparisons trapping again, since it results in better code when using
18420 FCOM-based compares. */
18421 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18424 enum machine_mode
18425 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18427 enum machine_mode mode = GET_MODE (op0);
18429 if (SCALAR_FLOAT_MODE_P (mode))
18431 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18432 return ix86_fp_compare_mode (code);
18435 switch (code)
18437 /* Only zero flag is needed. */
18438 case EQ: /* ZF=0 */
18439 case NE: /* ZF!=0 */
18440 return CCZmode;
18441 /* Codes needing carry flag. */
18442 case GEU: /* CF=0 */
18443 case LTU: /* CF=1 */
18444 /* Detect overflow checks. They need just the carry flag. */
18445 if (GET_CODE (op0) == PLUS
18446 && rtx_equal_p (op1, XEXP (op0, 0)))
18447 return CCCmode;
18448 else
18449 return CCmode;
18450 case GTU: /* CF=0 & ZF=0 */
18451 case LEU: /* CF=1 | ZF=1 */
18452 /* Detect overflow checks. They need just the carry flag. */
18453 if (GET_CODE (op0) == MINUS
18454 && rtx_equal_p (op1, XEXP (op0, 0)))
18455 return CCCmode;
18456 else
18457 return CCmode;
18458 /* Codes possibly doable only with sign flag when
18459 comparing against zero. */
18460 case GE: /* SF=OF or SF=0 */
18461 case LT: /* SF<>OF or SF=1 */
18462 if (op1 == const0_rtx)
18463 return CCGOCmode;
18464 else
18465 /* For other cases Carry flag is not required. */
18466 return CCGCmode;
18467 /* Codes doable only with sign flag when comparing
18468 against zero, but we miss jump instruction for it
18469 so we need to use relational tests against overflow
18470 that thus needs to be zero. */
18471 case GT: /* ZF=0 & SF=OF */
18472 case LE: /* ZF=1 | SF<>OF */
18473 if (op1 == const0_rtx)
18474 return CCNOmode;
18475 else
18476 return CCGCmode;
18477 /* The strcmp pattern does (use flags) and combine may ask us for the
18478 proper mode. */
18479 case USE:
18480 return CCmode;
18481 default:
18482 gcc_unreachable ();
18486 /* Return the fixed registers used for condition codes. */
18488 static bool
18489 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18491 *p1 = FLAGS_REG;
18492 *p2 = FPSR_REG;
18493 return true;
18496 /* If two condition code modes are compatible, return a condition code
18497 mode which is compatible with both. Otherwise, return
18498 VOIDmode. */
18500 static enum machine_mode
18501 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18503 if (m1 == m2)
18504 return m1;
18506 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18507 return VOIDmode;
18509 if ((m1 == CCGCmode && m2 == CCGOCmode)
18510 || (m1 == CCGOCmode && m2 == CCGCmode))
18511 return CCGCmode;
18513 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18514 return m2;
18515 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18516 return m1;
18518 switch (m1)
18520 default:
18521 gcc_unreachable ();
18523 case CCmode:
18524 case CCGCmode:
18525 case CCGOCmode:
18526 case CCNOmode:
18527 case CCAmode:
18528 case CCCmode:
18529 case CCOmode:
18530 case CCSmode:
18531 case CCZmode:
18532 switch (m2)
18534 default:
18535 return VOIDmode;
18537 case CCmode:
18538 case CCGCmode:
18539 case CCGOCmode:
18540 case CCNOmode:
18541 case CCAmode:
18542 case CCCmode:
18543 case CCOmode:
18544 case CCSmode:
18545 case CCZmode:
18546 return CCmode;
18549 case CCFPmode:
18550 case CCFPUmode:
18551 /* These are only compatible with themselves, which we already
18552 checked above. */
18553 return VOIDmode;
18558 /* Return a comparison we can do that is equivalent to
18559 swap_condition (code), apart possibly from orderedness.
18560 But, never change orderedness if TARGET_IEEE_FP, returning
18561 UNKNOWN in that case if necessary. */
18563 static enum rtx_code
18564 ix86_fp_swap_condition (enum rtx_code code)
18566 switch (code)
18568 case GT: /* GTU - CF=0 & ZF=0 */
18569 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18570 case GE: /* GEU - CF=0 */
18571 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18572 case UNLT: /* LTU - CF=1 */
18573 return TARGET_IEEE_FP ? UNKNOWN : GT;
18574 case UNLE: /* LEU - CF=1 | ZF=1 */
18575 return TARGET_IEEE_FP ? UNKNOWN : GE;
18576 default:
18577 return swap_condition (code);
18581 /* Return the cost of comparison CODE using the best strategy for performance.
18582 All following functions use the number of instructions as a cost metric.
18583 In the future this should be tweaked to compute bytes for optimize_size and
18584 take into account the performance of various instructions on various CPUs. */
18586 static int
18587 ix86_fp_comparison_cost (enum rtx_code code)
18589 int arith_cost;
18591 /* The cost of code using bit-twiddling on %ah. */
18592 switch (code)
18594 case UNLE:
18595 case UNLT:
18596 case LTGT:
18597 case GT:
18598 case GE:
18599 case UNORDERED:
18600 case ORDERED:
18601 case UNEQ:
18602 arith_cost = 4;
18603 break;
18604 case LT:
18605 case NE:
18606 case EQ:
18607 case UNGE:
18608 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18609 break;
18610 case LE:
18611 case UNGT:
18612 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18613 break;
18614 default:
18615 gcc_unreachable ();
18618 switch (ix86_fp_comparison_strategy (code))
18620 case IX86_FPCMP_COMI:
18621 return arith_cost > 4 ? 3 : 2;
18622 case IX86_FPCMP_SAHF:
18623 return arith_cost > 4 ? 4 : 3;
18624 default:
18625 return arith_cost;
18629 /* Return the strategy to use for floating-point comparisons. We assume that
18630 fcomi is always preferable where available, since that is also true when
18631 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18633 enum ix86_fpcmp_strategy
18634 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18636 /* Do fcomi/sahf based test when profitable. */
18638 if (TARGET_CMOVE)
18639 return IX86_FPCMP_COMI;
18641 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18642 return IX86_FPCMP_SAHF;
18644 return IX86_FPCMP_ARITH;
18647 /* Swap, force into registers, or otherwise massage the two operands
18648 to a fp comparison. The operands are updated in place; the new
18649 comparison code is returned. */
18651 static enum rtx_code
18652 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18654 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18655 rtx op0 = *pop0, op1 = *pop1;
18656 enum machine_mode op_mode = GET_MODE (op0);
18657 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18659 /* All of the unordered compare instructions only work on registers.
18660 The same is true of the fcomi compare instructions. The XFmode
18661 compare instructions require registers except when comparing
18662 against zero or when converting operand 1 from fixed point to
18663 floating point. */
18665 if (!is_sse
18666 && (fpcmp_mode == CCFPUmode
18667 || (op_mode == XFmode
18668 && ! (standard_80387_constant_p (op0) == 1
18669 || standard_80387_constant_p (op1) == 1)
18670 && GET_CODE (op1) != FLOAT)
18671 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18673 op0 = force_reg (op_mode, op0);
18674 op1 = force_reg (op_mode, op1);
18676 else
18678 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18679 things around if they appear profitable, otherwise force op0
18680 into a register. */
18682 if (standard_80387_constant_p (op0) == 0
18683 || (MEM_P (op0)
18684 && ! (standard_80387_constant_p (op1) == 0
18685 || MEM_P (op1))))
18687 enum rtx_code new_code = ix86_fp_swap_condition (code);
18688 if (new_code != UNKNOWN)
18690 rtx tmp;
18691 tmp = op0, op0 = op1, op1 = tmp;
18692 code = new_code;
18696 if (!REG_P (op0))
18697 op0 = force_reg (op_mode, op0);
18699 if (CONSTANT_P (op1))
18701 int tmp = standard_80387_constant_p (op1);
18702 if (tmp == 0)
18703 op1 = validize_mem (force_const_mem (op_mode, op1));
18704 else if (tmp == 1)
18706 if (TARGET_CMOVE)
18707 op1 = force_reg (op_mode, op1);
18709 else
18710 op1 = force_reg (op_mode, op1);
18714 /* Try to rearrange the comparison to make it cheaper. */
18715 if (ix86_fp_comparison_cost (code)
18716 > ix86_fp_comparison_cost (swap_condition (code))
18717 && (REG_P (op1) || can_create_pseudo_p ()))
18719 rtx tmp;
18720 tmp = op0, op0 = op1, op1 = tmp;
18721 code = swap_condition (code);
18722 if (!REG_P (op0))
18723 op0 = force_reg (op_mode, op0);
18726 *pop0 = op0;
18727 *pop1 = op1;
18728 return code;
18731 /* Convert comparison codes we use to represent FP comparison to integer
18732 code that will result in proper branch. Return UNKNOWN if no such code
18733 is available. */
18735 enum rtx_code
18736 ix86_fp_compare_code_to_integer (enum rtx_code code)
18738 switch (code)
18740 case GT:
18741 return GTU;
18742 case GE:
18743 return GEU;
18744 case ORDERED:
18745 case UNORDERED:
18746 return code;
18747 break;
18748 case UNEQ:
18749 return EQ;
18750 break;
18751 case UNLT:
18752 return LTU;
18753 break;
18754 case UNLE:
18755 return LEU;
18756 break;
18757 case LTGT:
18758 return NE;
18759 break;
18760 default:
18761 return UNKNOWN;
18765 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18767 static rtx
18768 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18770 enum machine_mode fpcmp_mode, intcmp_mode;
18771 rtx tmp, tmp2;
18773 fpcmp_mode = ix86_fp_compare_mode (code);
18774 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18776 /* Do fcomi/sahf based test when profitable. */
18777 switch (ix86_fp_comparison_strategy (code))
18779 case IX86_FPCMP_COMI:
18780 intcmp_mode = fpcmp_mode;
18781 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18782 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18783 tmp);
18784 emit_insn (tmp);
18785 break;
18787 case IX86_FPCMP_SAHF:
18788 intcmp_mode = fpcmp_mode;
18789 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18790 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18791 tmp);
18793 if (!scratch)
18794 scratch = gen_reg_rtx (HImode);
18795 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18796 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18797 break;
18799 case IX86_FPCMP_ARITH:
18800 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18801 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18802 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18803 if (!scratch)
18804 scratch = gen_reg_rtx (HImode);
18805 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18807 /* In the unordered case, we have to check C2 for NaNs, which
18808 doesn't happen to work out to anything nice combination-wise.
18809 So do some bit twiddling on the value we've got in AH to come
18810 up with an appropriate set of condition codes. */
18812 intcmp_mode = CCNOmode;
18813 switch (code)
18815 case GT:
18816 case UNGT:
18817 if (code == GT || !TARGET_IEEE_FP)
18819 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18820 code = EQ;
18822 else
18824 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18825 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18826 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18827 intcmp_mode = CCmode;
18828 code = GEU;
18830 break;
18831 case LT:
18832 case UNLT:
18833 if (code == LT && TARGET_IEEE_FP)
18835 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18836 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18837 intcmp_mode = CCmode;
18838 code = EQ;
18840 else
18842 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18843 code = NE;
18845 break;
18846 case GE:
18847 case UNGE:
18848 if (code == GE || !TARGET_IEEE_FP)
18850 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18851 code = EQ;
18853 else
18855 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18856 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18857 code = NE;
18859 break;
18860 case LE:
18861 case UNLE:
18862 if (code == LE && TARGET_IEEE_FP)
18864 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18865 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18866 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18867 intcmp_mode = CCmode;
18868 code = LTU;
18870 else
18872 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18873 code = NE;
18875 break;
18876 case EQ:
18877 case UNEQ:
18878 if (code == EQ && TARGET_IEEE_FP)
18880 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18881 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18882 intcmp_mode = CCmode;
18883 code = EQ;
18885 else
18887 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18888 code = NE;
18890 break;
18891 case NE:
18892 case LTGT:
18893 if (code == NE && TARGET_IEEE_FP)
18895 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18896 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18897 GEN_INT (0x40)));
18898 code = NE;
18900 else
18902 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18903 code = EQ;
18905 break;
18907 case UNORDERED:
18908 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18909 code = NE;
18910 break;
18911 case ORDERED:
18912 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18913 code = EQ;
18914 break;
18916 default:
18917 gcc_unreachable ();
18919 break;
18921 default:
18922 gcc_unreachable();
18925 /* Return the test that should be put into the flags user, i.e.
18926 the bcc, scc, or cmov instruction. */
18927 return gen_rtx_fmt_ee (code, VOIDmode,
18928 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18929 const0_rtx);
18932 static rtx
18933 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18935 rtx ret;
18937 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18938 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18940 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18942 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18943 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18945 else
18946 ret = ix86_expand_int_compare (code, op0, op1);
18948 return ret;
18951 void
18952 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18954 enum machine_mode mode = GET_MODE (op0);
18955 rtx tmp;
18957 switch (mode)
18959 case SFmode:
18960 case DFmode:
18961 case XFmode:
18962 case QImode:
18963 case HImode:
18964 case SImode:
18965 simple:
18966 tmp = ix86_expand_compare (code, op0, op1);
18967 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18968 gen_rtx_LABEL_REF (VOIDmode, label),
18969 pc_rtx);
18970 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18971 return;
18973 case DImode:
18974 if (TARGET_64BIT)
18975 goto simple;
18976 case TImode:
18977 /* Expand DImode branch into multiple compare+branch. */
18979 rtx lo[2], hi[2], label2;
18980 enum rtx_code code1, code2, code3;
18981 enum machine_mode submode;
18983 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18985 tmp = op0, op0 = op1, op1 = tmp;
18986 code = swap_condition (code);
18989 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18990 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18992 submode = mode == DImode ? SImode : DImode;
18994 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18995 avoid two branches. This costs one extra insn, so disable when
18996 optimizing for size. */
18998 if ((code == EQ || code == NE)
18999 && (!optimize_insn_for_size_p ()
19000 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19002 rtx xor0, xor1;
19004 xor1 = hi[0];
19005 if (hi[1] != const0_rtx)
19006 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19007 NULL_RTX, 0, OPTAB_WIDEN);
19009 xor0 = lo[0];
19010 if (lo[1] != const0_rtx)
19011 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19012 NULL_RTX, 0, OPTAB_WIDEN);
19014 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19015 NULL_RTX, 0, OPTAB_WIDEN);
19017 ix86_expand_branch (code, tmp, const0_rtx, label);
19018 return;
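/* Illustration only (not from the i386 sources; exact registers depend
   on allocation): for the EQ/NE path above, a 32-bit target compiling

       int f (unsigned long long a, unsigned long long b)
       { return a == b; }

   ends up with roughly

       xorl  hi(b), hi(a)
       xorl  lo(b), lo(a)
       orl   lo(a), hi(a)
       sete  %al

   i.e. a single flags test instead of two compare-and-branch pairs.  */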
19021 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19022 op1 is a constant and the low word is zero, then we can just
19023 examine the high word. Similarly for low word -1 and
19024 less-or-equal-than or greater-than. */
19026 if (CONST_INT_P (hi[1]))
19027 switch (code)
19029 case LT: case LTU: case GE: case GEU:
19030 if (lo[1] == const0_rtx)
19032 ix86_expand_branch (code, hi[0], hi[1], label);
19033 return;
19035 break;
19036 case LE: case LEU: case GT: case GTU:
19037 if (lo[1] == constm1_rtx)
19039 ix86_expand_branch (code, hi[0], hi[1], label);
19040 return;
19042 break;
19043 default:
19044 break;
19047 /* Otherwise, we need two or three jumps. */
19049 label2 = gen_label_rtx ();
19051 code1 = code;
19052 code2 = swap_condition (code);
19053 code3 = unsigned_condition (code);
19055 switch (code)
19057 case LT: case GT: case LTU: case GTU:
19058 break;
19060 case LE: code1 = LT; code2 = GT; break;
19061 case GE: code1 = GT; code2 = LT; break;
19062 case LEU: code1 = LTU; code2 = GTU; break;
19063 case GEU: code1 = GTU; code2 = LTU; break;
19065 case EQ: code1 = UNKNOWN; code2 = NE; break;
19066 case NE: code2 = UNKNOWN; break;
19068 default:
19069 gcc_unreachable ();
19073 * a < b =>
19074 * if (hi(a) < hi(b)) goto true;
19075 * if (hi(a) > hi(b)) goto false;
19076 * if (lo(a) < lo(b)) goto true;
19077 * false:
19080 if (code1 != UNKNOWN)
19081 ix86_expand_branch (code1, hi[0], hi[1], label);
19082 if (code2 != UNKNOWN)
19083 ix86_expand_branch (code2, hi[0], hi[1], label2);
19085 ix86_expand_branch (code3, lo[0], lo[1], label);
19087 if (code2 != UNKNOWN)
19088 emit_label (label2);
19089 return;
19092 default:
19093 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19094 goto simple;
19098 /* Split branch based on floating point condition. */
19099 void
19100 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19101 rtx target1, rtx target2, rtx tmp, rtx pushed)
19103 rtx condition;
19104 rtx i;
19106 if (target2 != pc_rtx)
19108 rtx tmp = target2;
19109 code = reverse_condition_maybe_unordered (code);
19110 target2 = target1;
19111 target1 = tmp;
19114 condition = ix86_expand_fp_compare (code, op1, op2,
19115 tmp);
19117 /* Remove pushed operand from stack. */
19118 if (pushed)
19119 ix86_free_from_memory (GET_MODE (pushed));
19121 i = emit_jump_insn (gen_rtx_SET
19122 (VOIDmode, pc_rtx,
19123 gen_rtx_IF_THEN_ELSE (VOIDmode,
19124 condition, target1, target2)));
19125 if (split_branch_probability >= 0)
19126 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19129 void
19130 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19132 rtx ret;
19134 gcc_assert (GET_MODE (dest) == QImode);
19136 ret = ix86_expand_compare (code, op0, op1);
19137 PUT_MODE (ret, QImode);
19138 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19141 /* Expand a comparison setting or clearing the carry flag. Return true when
19142 successful and set *pop to the comparison operation. */
19143 static bool
19144 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19146 enum machine_mode mode =
19147 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19149 /* Do not handle double-word compares, which go through a special path. */
19150 if (mode == (TARGET_64BIT ? TImode : DImode))
19151 return false;
19153 if (SCALAR_FLOAT_MODE_P (mode))
19155 rtx compare_op, compare_seq;
19157 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19159 /* Shortcut: the following common codes never translate
19160 into carry flag compares. */
19161 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19162 || code == ORDERED || code == UNORDERED)
19163 return false;
19165 /* These comparisons require the zero flag; swap the operands so they won't need it. */
19166 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19167 && !TARGET_IEEE_FP)
19169 rtx tmp = op0;
19170 op0 = op1;
19171 op1 = tmp;
19172 code = swap_condition (code);
19175 /* Try to expand the comparison and verify that we end up with
19176 a carry-flag-based comparison. This fails only when we decide
19177 to expand the comparison using arithmetic, which is not a
19178 common scenario. */
19179 start_sequence ();
19180 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19181 compare_seq = get_insns ();
19182 end_sequence ();
19184 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19185 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19186 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19187 else
19188 code = GET_CODE (compare_op);
19190 if (code != LTU && code != GEU)
19191 return false;
19193 emit_insn (compare_seq);
19194 *pop = compare_op;
19195 return true;
19198 if (!INTEGRAL_MODE_P (mode))
19199 return false;
19201 switch (code)
19203 case LTU:
19204 case GEU:
19205 break;
19207 /* Convert a==0 into (unsigned)a<1. */
19208 case EQ:
19209 case NE:
19210 if (op1 != const0_rtx)
19211 return false;
19212 op1 = const1_rtx;
19213 code = (code == EQ ? LTU : GEU);
19214 break;
19216 /* Convert a>b into b<a or a>=b+1. */
19217 case GTU:
19218 case LEU:
19219 if (CONST_INT_P (op1))
19221 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19222 /* Bail out on overflow. We could still swap the operands, but that
19223 would force loading of the constant into a register. */
19224 if (op1 == const0_rtx
19225 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19226 return false;
19227 code = (code == GTU ? GEU : LTU);
19229 else
19231 rtx tmp = op1;
19232 op1 = op0;
19233 op0 = tmp;
19234 code = (code == GTU ? LTU : GEU);
19236 break;
19238 /* Convert a>=0 into (unsigned)a<0x80000000. */
19239 case LT:
19240 case GE:
19241 if (mode == DImode || op1 != const0_rtx)
19242 return false;
19243 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19244 code = (code == LT ? GEU : LTU);
19245 break;
19246 case LE:
19247 case GT:
19248 if (mode == DImode || op1 != constm1_rtx)
19249 return false;
19250 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19251 code = (code == LE ? GEU : LTU);
19252 break;
19254 default:
19255 return false;
19257 /* Swapping the operands may cause a constant to appear as the first operand. */
19258 if (!nonimmediate_operand (op0, VOIDmode))
19260 if (!can_create_pseudo_p ())
19261 return false;
19262 op0 = force_reg (mode, op0);
19264 *pop = ix86_expand_compare (code, op0, op1);
19265 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19266 return true;
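/* Illustrative sketch (not from the i386 sources; function name invented,
   never compiled thanks to #if 0): the integer rewrites above rest on the
   unsigned identities checked below.  */
#if 0
static void
illustrate_carry_flag_rewrites (unsigned int a, unsigned int b)
{
  /* EQ:  a == 0        <->  (unsigned) a < 1.  */
  gcc_assert ((a == 0) == (a < 1U));
  /* GTU: a > b         <->  a >= b + 1, valid while b + 1 does not wrap.  */
  if (b != ~0U)
    gcc_assert ((a > b) == (a >= b + 1));
  /* GE:  (int) a >= 0  <->  a < 0x80000000.  */
  gcc_assert (((int) a >= 0) == (a < 0x80000000U));
}
#endif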
19269 bool
19270 ix86_expand_int_movcc (rtx operands[])
19272 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19273 rtx compare_seq, compare_op;
19274 enum machine_mode mode = GET_MODE (operands[0]);
19275 bool sign_bit_compare_p = false;
19276 rtx op0 = XEXP (operands[1], 0);
19277 rtx op1 = XEXP (operands[1], 1);
19279 if (GET_MODE (op0) == TImode
19280 || (GET_MODE (op0) == DImode
19281 && !TARGET_64BIT))
19282 return false;
19284 start_sequence ();
19285 compare_op = ix86_expand_compare (code, op0, op1);
19286 compare_seq = get_insns ();
19287 end_sequence ();
19289 compare_code = GET_CODE (compare_op);
19291 if ((op1 == const0_rtx && (code == GE || code == LT))
19292 || (op1 == constm1_rtx && (code == GT || code == LE)))
19293 sign_bit_compare_p = true;
19295 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19296 HImode insns, we'd be swallowed in word prefix ops. */
19298 if ((mode != HImode || TARGET_FAST_PREFIX)
19299 && (mode != (TARGET_64BIT ? TImode : DImode))
19300 && CONST_INT_P (operands[2])
19301 && CONST_INT_P (operands[3]))
19303 rtx out = operands[0];
19304 HOST_WIDE_INT ct = INTVAL (operands[2]);
19305 HOST_WIDE_INT cf = INTVAL (operands[3]);
19306 HOST_WIDE_INT diff;
19308 diff = ct - cf;
19309 /* Sign-bit compares are better done using shifts than by using
19310 sbb. */
19311 if (sign_bit_compare_p
19312 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19314 /* Detect overlap between destination and compare sources. */
19315 rtx tmp = out;
19317 if (!sign_bit_compare_p)
19319 rtx flags;
19320 bool fpcmp = false;
19322 compare_code = GET_CODE (compare_op);
19324 flags = XEXP (compare_op, 0);
19326 if (GET_MODE (flags) == CCFPmode
19327 || GET_MODE (flags) == CCFPUmode)
19329 fpcmp = true;
19330 compare_code
19331 = ix86_fp_compare_code_to_integer (compare_code);
19334 /* To simplify the rest of the code, restrict to the GEU case. */
19335 if (compare_code == LTU)
19337 HOST_WIDE_INT tmp = ct;
19338 ct = cf;
19339 cf = tmp;
19340 compare_code = reverse_condition (compare_code);
19341 code = reverse_condition (code);
19343 else
19345 if (fpcmp)
19346 PUT_CODE (compare_op,
19347 reverse_condition_maybe_unordered
19348 (GET_CODE (compare_op)));
19349 else
19350 PUT_CODE (compare_op,
19351 reverse_condition (GET_CODE (compare_op)));
19353 diff = ct - cf;
19355 if (reg_overlap_mentioned_p (out, op0)
19356 || reg_overlap_mentioned_p (out, op1))
19357 tmp = gen_reg_rtx (mode);
19359 if (mode == DImode)
19360 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19361 else
19362 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19363 flags, compare_op));
19365 else
19367 if (code == GT || code == GE)
19368 code = reverse_condition (code);
19369 else
19371 HOST_WIDE_INT tmp = ct;
19372 ct = cf;
19373 cf = tmp;
19374 diff = ct - cf;
19376 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19379 if (diff == 1)
19382 * cmpl op0,op1
19383 * sbbl dest,dest
19384 * [addl dest, ct]
19386 * Size 5 - 8.
19388 if (ct)
19389 tmp = expand_simple_binop (mode, PLUS,
19390 tmp, GEN_INT (ct),
19391 copy_rtx (tmp), 1, OPTAB_DIRECT);
19393 else if (cf == -1)
19396 * cmpl op0,op1
19397 * sbbl dest,dest
19398 * orl $ct, dest
19400 * Size 8.
19402 tmp = expand_simple_binop (mode, IOR,
19403 tmp, GEN_INT (ct),
19404 copy_rtx (tmp), 1, OPTAB_DIRECT);
19406 else if (diff == -1 && ct)
19409 * cmpl op0,op1
19410 * sbbl dest,dest
19411 * notl dest
19412 * [addl dest, cf]
19414 * Size 8 - 11.
19416 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19417 if (cf)
19418 tmp = expand_simple_binop (mode, PLUS,
19419 copy_rtx (tmp), GEN_INT (cf),
19420 copy_rtx (tmp), 1, OPTAB_DIRECT);
19422 else
19425 * cmpl op0,op1
19426 * sbbl dest,dest
19427 * [notl dest]
19428 * andl cf - ct, dest
19429 * [addl dest, ct]
19431 * Size 8 - 11.
19434 if (cf == 0)
19436 cf = ct;
19437 ct = 0;
19438 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19441 tmp = expand_simple_binop (mode, AND,
19442 copy_rtx (tmp),
19443 gen_int_mode (cf - ct, mode),
19444 copy_rtx (tmp), 1, OPTAB_DIRECT);
19445 if (ct)
19446 tmp = expand_simple_binop (mode, PLUS,
19447 copy_rtx (tmp), GEN_INT (ct),
19448 copy_rtx (tmp), 1, OPTAB_DIRECT);
19451 if (!rtx_equal_p (tmp, out))
19452 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19454 return true;
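/* Illustration only (not from the i386 sources; register choice is
   arbitrary): the core of the sequences sized above is the classic
   branch-free select.  For an unsigned "op0 < op1" condition with op0
   in %eax and op1 in %ebx:

       cmpl  %ebx, %eax      # CF = (op0 <u op1)
       sbbl  %edx, %edx      # %edx = CF ? -1 : 0

   and the optional not/and/or/add instructions then turn that all-ones
   or all-zeros mask into the requested ct/cf pair.  */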
19457 if (diff < 0)
19459 enum machine_mode cmp_mode = GET_MODE (op0);
19461 HOST_WIDE_INT tmp;
19462 tmp = ct, ct = cf, cf = tmp;
19463 diff = -diff;
19465 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19467 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19469 /* We may be reversing an unordered compare to a normal compare, which
19470 is not valid in general (we may convert a non-trapping condition
19471 into a trapping one); however, on i386 we currently emit all
19472 comparisons unordered. */
19473 compare_code = reverse_condition_maybe_unordered (compare_code);
19474 code = reverse_condition_maybe_unordered (code);
19476 else
19478 compare_code = reverse_condition (compare_code);
19479 code = reverse_condition (code);
19483 compare_code = UNKNOWN;
19484 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19485 && CONST_INT_P (op1))
19487 if (op1 == const0_rtx
19488 && (code == LT || code == GE))
19489 compare_code = code;
19490 else if (op1 == constm1_rtx)
19492 if (code == LE)
19493 compare_code = LT;
19494 else if (code == GT)
19495 compare_code = GE;
19499 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19500 if (compare_code != UNKNOWN
19501 && GET_MODE (op0) == GET_MODE (out)
19502 && (cf == -1 || ct == -1))
19504 /* If the lea code below could be used, only optimize
19505 if it results in a 2-insn sequence. */
19507 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19508 || diff == 3 || diff == 5 || diff == 9)
19509 || (compare_code == LT && ct == -1)
19510 || (compare_code == GE && cf == -1))
19513 * notl op1 (if necessary)
19514 * sarl $31, op1
19515 * orl cf, op1
19517 if (ct != -1)
19519 cf = ct;
19520 ct = -1;
19521 code = reverse_condition (code);
19524 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19526 out = expand_simple_binop (mode, IOR,
19527 out, GEN_INT (cf),
19528 out, 1, OPTAB_DIRECT);
19529 if (out != operands[0])
19530 emit_move_insn (operands[0], out);
19532 return true;
19537 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19538 || diff == 3 || diff == 5 || diff == 9)
19539 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19540 && (mode != DImode
19541 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19544 * xorl dest,dest
19545 * cmpl op1,op2
19546 * setcc dest
19547 * lea cf(dest*(ct-cf)),dest
19549 * Size 14.
19551 * This also catches the degenerate setcc-only case.
19554 rtx tmp;
19555 int nops;
19557 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19559 nops = 0;
19560 /* On x86_64 the lea instruction operates on Pmode, so we need
19561 to get the arithmetic done in the proper mode to match. */
19562 if (diff == 1)
19563 tmp = copy_rtx (out);
19564 else
19566 rtx out1;
19567 out1 = copy_rtx (out);
19568 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19569 nops++;
19570 if (diff & 1)
19572 tmp = gen_rtx_PLUS (mode, tmp, out1);
19573 nops++;
19576 if (cf != 0)
19578 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19579 nops++;
19581 if (!rtx_equal_p (tmp, out))
19583 if (nops == 1)
19584 out = force_operand (tmp, copy_rtx (out));
19585 else
19586 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19588 if (!rtx_equal_p (out, operands[0]))
19589 emit_move_insn (operands[0], copy_rtx (out));
19591 return true;
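/* Illustration only (not from the i386 sources; registers illustrative):
   for signed ints, "a < b ? 7 : 3" has diff == 4, so the setcc+lea path
   above emits roughly

       xorl  %eax, %eax
       cmpl  %esi, %edi        # a in %edi, b in %esi
       setl  %al               # %al = (a < b)
       leal  3(,%eax,4), %eax  # 3 + 4 * (a < b)  ->  7 or 3

   the lea folding both the multiply by diff and the add of cf.  */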
19595 * General case: Jumpful:
19596 * xorl dest,dest cmpl op1, op2
19597 * cmpl op1, op2 movl ct, dest
19598 * setcc dest jcc 1f
19599 * decl dest movl cf, dest
19600 * andl (cf-ct),dest 1:
19601 * addl ct,dest
19603 * Size 20. Size 14.
19605 * This is reasonably steep, but branch mispredict costs are
19606 * high on modern cpus, so consider failing only if optimizing
19607 * for space.
19610 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19611 && BRANCH_COST (optimize_insn_for_speed_p (),
19612 false) >= 2)
19614 if (cf == 0)
19616 enum machine_mode cmp_mode = GET_MODE (op0);
19618 cf = ct;
19619 ct = 0;
19621 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19623 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19625 /* We may be reversing an unordered compare to a normal compare,
19626 which is not valid in general (we may convert a non-trapping
19627 condition into a trapping one); however, on i386 we currently
19628 emit all comparisons unordered. */
19629 code = reverse_condition_maybe_unordered (code);
19631 else
19633 code = reverse_condition (code);
19634 if (compare_code != UNKNOWN)
19635 compare_code = reverse_condition (compare_code);
19639 if (compare_code != UNKNOWN)
19641 /* notl op1 (if needed)
19642 sarl $31, op1
19643 andl (cf-ct), op1
19644 addl ct, op1
19646 For x < 0 (resp. x <= -1) there will be no notl,
19647 so if possible swap the constants to get rid of the
19648 complement.
19649 True/false will be -1/0 while code below (store flag
19650 followed by decrement) is 0/-1, so the constants need
19651 to be exchanged once more. */
19653 if (compare_code == GE || !cf)
19655 code = reverse_condition (code);
19656 compare_code = LT;
19658 else
19660 HOST_WIDE_INT tmp = cf;
19661 cf = ct;
19662 ct = tmp;
19665 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19667 else
19669 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19671 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19672 constm1_rtx,
19673 copy_rtx (out), 1, OPTAB_DIRECT);
19676 out = expand_simple_binop (mode, AND, copy_rtx (out),
19677 gen_int_mode (cf - ct, mode),
19678 copy_rtx (out), 1, OPTAB_DIRECT);
19679 if (ct)
19680 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19681 copy_rtx (out), 1, OPTAB_DIRECT);
19682 if (!rtx_equal_p (out, operands[0]))
19683 emit_move_insn (operands[0], copy_rtx (out));
19685 return true;
19689 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19691 /* Try a few things more with specific constants and a variable. */
19693 optab op;
19694 rtx var, orig_out, out, tmp;
19696 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19697 return false;
19699 /* If one of the two operands is an interesting constant, rewrite the other
19700 arm to -1 or 0, expand that constant movcc as above, and mask the variable in with a logical operation. */
19702 if (CONST_INT_P (operands[2]))
19704 var = operands[3];
19705 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19706 operands[3] = constm1_rtx, op = and_optab;
19707 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19708 operands[3] = const0_rtx, op = ior_optab;
19709 else
19710 return false;
19712 else if (CONST_INT_P (operands[3]))
19714 var = operands[2];
19715 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19716 operands[2] = constm1_rtx, op = and_optab;
19717 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19718 operands[2] = const0_rtx, op = ior_optab;
19719 else
19720 return false;
19722 else
19723 return false;
19725 orig_out = operands[0];
19726 tmp = gen_reg_rtx (mode);
19727 operands[0] = tmp;
19729 /* Recurse to get the constant loaded. */
19730 if (ix86_expand_int_movcc (operands) == 0)
19731 return false;
19733 /* Mask in the interesting variable. */
19734 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19735 OPTAB_WIDEN);
19736 if (!rtx_equal_p (out, orig_out))
19737 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19739 return true;
19743 * For comparison with above,
19745 * movl cf,dest
19746 * movl ct,tmp
19747 * cmpl op1,op2
19748 * cmovcc tmp,dest
19750 * Size 15.
19753 if (! nonimmediate_operand (operands[2], mode))
19754 operands[2] = force_reg (mode, operands[2]);
19755 if (! nonimmediate_operand (operands[3], mode))
19756 operands[3] = force_reg (mode, operands[3]);
19758 if (! register_operand (operands[2], VOIDmode)
19759 && (mode == QImode
19760 || ! register_operand (operands[3], VOIDmode)))
19761 operands[2] = force_reg (mode, operands[2]);
19763 if (mode == QImode
19764 && ! register_operand (operands[3], VOIDmode))
19765 operands[3] = force_reg (mode, operands[3]);
19767 emit_insn (compare_seq);
19768 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19769 gen_rtx_IF_THEN_ELSE (mode,
19770 compare_op, operands[2],
19771 operands[3])));
19772 return true;
19775 /* Swap, force into registers, or otherwise massage the two operands
19776 to an sse comparison with a mask result. Thus we differ a bit from
19777 ix86_prepare_fp_compare_args which expects to produce a flags result.
19779 The DEST operand exists to help determine whether to commute commutative
19780 operators. The POP0/POP1 operands are updated in place. The new
19781 comparison code is returned, or UNKNOWN if not implementable. */
19783 static enum rtx_code
19784 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19785 rtx *pop0, rtx *pop1)
19787 rtx tmp;
19789 switch (code)
19791 case LTGT:
19792 case UNEQ:
19793 /* AVX supports all the needed comparisons. */
19794 if (TARGET_AVX)
19795 break;
19796 /* We have no LTGT as an operator. We could implement it with
19797 NE & ORDERED, but this requires an extra temporary. It's
19798 not clear that it's worth it. */
19799 return UNKNOWN;
19801 case LT:
19802 case LE:
19803 case UNGT:
19804 case UNGE:
19805 /* These are supported directly. */
19806 break;
19808 case EQ:
19809 case NE:
19810 case UNORDERED:
19811 case ORDERED:
19812 /* AVX has 3 operand comparisons, no need to swap anything. */
19813 if (TARGET_AVX)
19814 break;
19815 /* For commutative operators, try to canonicalize the destination
19816 operand to be first in the comparison - this helps reload to
19817 avoid extra moves. */
19818 if (!dest || !rtx_equal_p (dest, *pop1))
19819 break;
19820 /* FALLTHRU */
19822 case GE:
19823 case GT:
19824 case UNLE:
19825 case UNLT:
19826 /* These are not supported directly before AVX, and furthermore
19827 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19828 comparison operands to transform into something that is
19829 supported. */
19830 tmp = *pop0;
19831 *pop0 = *pop1;
19832 *pop1 = tmp;
19833 code = swap_condition (code);
19834 break;
19836 default:
19837 gcc_unreachable ();
19840 return code;
19843 /* Detect conditional moves that exactly match min/max operational
19844 semantics. Note that this is IEEE safe, as long as we don't
19845 interchange the operands.
19847 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19848 and TRUE if the operation is successful and instructions are emitted. */
19850 static bool
19851 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19852 rtx cmp_op1, rtx if_true, rtx if_false)
19854 enum machine_mode mode;
19855 bool is_min;
19856 rtx tmp;
19858 if (code == LT)
19860 else if (code == UNGE)
19862 tmp = if_true;
19863 if_true = if_false;
19864 if_false = tmp;
19866 else
19867 return false;
19869 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19870 is_min = true;
19871 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19872 is_min = false;
19873 else
19874 return false;
19876 mode = GET_MODE (dest);
19878 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19879 but MODE may be a vector mode and thus not appropriate. */
19880 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19882 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19883 rtvec v;
19885 if_true = force_reg (mode, if_true);
19886 v = gen_rtvec (2, if_true, if_false);
19887 tmp = gen_rtx_UNSPEC (mode, v, u);
19889 else
19891 code = is_min ? SMIN : SMAX;
19892 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19895 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19896 return true;
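/* Illustrative sketch (not from the i386 sources; name invented, never
   compiled): the "don't interchange the operands" caveat above mirrors
   minss/minsd semantics, where a NaN in either input makes the result
   the second source operand.  This scalar model is exactly what a
   LT/if_true/if_false match turns into.  */
#if 0
static double
illustrate_sse_min (double a, double b)
{
  /* Maps to minsd: returns b whenever a or b is a NaN, so swapping the
     arms would silently change NaN behaviour.  */
  return a < b ? a : b;
}
#endif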
19899 /* Expand an sse vector comparison. Return the register with the result. */
19901 static rtx
19902 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19903 rtx op_true, rtx op_false)
19905 enum machine_mode mode = GET_MODE (dest);
19906 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19907 rtx x;
19909 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19910 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19911 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19913 if (optimize
19914 || reg_overlap_mentioned_p (dest, op_true)
19915 || reg_overlap_mentioned_p (dest, op_false))
19916 dest = gen_reg_rtx (mode);
19918 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19919 if (cmp_mode != mode)
19921 x = force_reg (cmp_mode, x);
19922 convert_move (dest, x, false);
19924 else
19925 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19927 return dest;
19930 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19931 operations. This is used for both scalar and vector conditional moves. */
19933 static void
19934 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19936 enum machine_mode mode = GET_MODE (dest);
19937 rtx t2, t3, x;
19939 if (vector_all_ones_operand (op_true, mode)
19940 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19942 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19944 else if (op_false == CONST0_RTX (mode))
19946 op_true = force_reg (mode, op_true);
19947 x = gen_rtx_AND (mode, cmp, op_true);
19948 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19950 else if (op_true == CONST0_RTX (mode))
19952 op_false = force_reg (mode, op_false);
19953 x = gen_rtx_NOT (mode, cmp);
19954 x = gen_rtx_AND (mode, x, op_false);
19955 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19957 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19959 op_false = force_reg (mode, op_false);
19960 x = gen_rtx_IOR (mode, cmp, op_false);
19961 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19963 else if (TARGET_XOP)
19965 op_true = force_reg (mode, op_true);
19967 if (!nonimmediate_operand (op_false, mode))
19968 op_false = force_reg (mode, op_false);
19970 emit_insn (gen_rtx_SET (mode, dest,
19971 gen_rtx_IF_THEN_ELSE (mode, cmp,
19972 op_true,
19973 op_false)));
19975 else
19977 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19979 if (!nonimmediate_operand (op_true, mode))
19980 op_true = force_reg (mode, op_true);
19982 op_false = force_reg (mode, op_false);
19984 switch (mode)
19986 case V4SFmode:
19987 if (TARGET_SSE4_1)
19988 gen = gen_sse4_1_blendvps;
19989 break;
19990 case V2DFmode:
19991 if (TARGET_SSE4_1)
19992 gen = gen_sse4_1_blendvpd;
19993 break;
19994 case V16QImode:
19995 case V8HImode:
19996 case V4SImode:
19997 case V2DImode:
19998 if (TARGET_SSE4_1)
20000 gen = gen_sse4_1_pblendvb;
20001 dest = gen_lowpart (V16QImode, dest);
20002 op_false = gen_lowpart (V16QImode, op_false);
20003 op_true = gen_lowpart (V16QImode, op_true);
20004 cmp = gen_lowpart (V16QImode, cmp);
20006 break;
20007 case V8SFmode:
20008 if (TARGET_AVX)
20009 gen = gen_avx_blendvps256;
20010 break;
20011 case V4DFmode:
20012 if (TARGET_AVX)
20013 gen = gen_avx_blendvpd256;
20014 break;
20015 case V32QImode:
20016 case V16HImode:
20017 case V8SImode:
20018 case V4DImode:
20019 if (TARGET_AVX2)
20021 gen = gen_avx2_pblendvb;
20022 dest = gen_lowpart (V32QImode, dest);
20023 op_false = gen_lowpart (V32QImode, op_false);
20024 op_true = gen_lowpart (V32QImode, op_true);
20025 cmp = gen_lowpart (V32QImode, cmp);
20027 break;
20028 default:
20029 break;
20032 if (gen != NULL)
20033 emit_insn (gen (dest, op_false, op_true, cmp));
20034 else
20036 op_true = force_reg (mode, op_true);
20038 t2 = gen_reg_rtx (mode);
20039 if (optimize)
20040 t3 = gen_reg_rtx (mode);
20041 else
20042 t3 = dest;
20044 x = gen_rtx_AND (mode, op_true, cmp);
20045 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20047 x = gen_rtx_NOT (mode, cmp);
20048 x = gen_rtx_AND (mode, x, op_false);
20049 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20051 x = gen_rtx_IOR (mode, t3, t2);
20052 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
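/* Illustrative sketch (not from the i386 sources; name invented, never
   compiled): the fallback just above is the standard mask-blend identity
       dest = (cmp AND op_true) OR (NOT cmp AND op_false)
   which is exact because each lane of a vector compare result is either
   all-ones or all-zeros.  */
#if 0
static unsigned int
illustrate_mask_blend (unsigned int cmp, unsigned int t, unsigned int f)
{
  /* CMP is assumed to be 0 or 0xffffffff, like one compare lane.  */
  return (cmp & t) | (~cmp & f);
}
#endif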
20057 /* Expand a floating-point conditional move. Return true if successful. */
20059 bool
20060 ix86_expand_fp_movcc (rtx operands[])
20062 enum machine_mode mode = GET_MODE (operands[0]);
20063 enum rtx_code code = GET_CODE (operands[1]);
20064 rtx tmp, compare_op;
20065 rtx op0 = XEXP (operands[1], 0);
20066 rtx op1 = XEXP (operands[1], 1);
20068 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20070 enum machine_mode cmode;
20072 /* Since we've no cmove for sse registers, don't force bad register
20073 allocation just to gain access to it. Deny movcc when the
20074 comparison mode doesn't match the move mode. */
20075 cmode = GET_MODE (op0);
20076 if (cmode == VOIDmode)
20077 cmode = GET_MODE (op1);
20078 if (cmode != mode)
20079 return false;
20081 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20082 if (code == UNKNOWN)
20083 return false;
20085 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20086 operands[2], operands[3]))
20087 return true;
20089 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20090 operands[2], operands[3]);
20091 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20092 return true;
20095 if (GET_MODE (op0) == TImode
20096 || (GET_MODE (op0) == DImode
20097 && !TARGET_64BIT))
20098 return false;
20100 /* The floating point conditional move instructions don't directly
20101 support conditions resulting from a signed integer comparison. */
20103 compare_op = ix86_expand_compare (code, op0, op1);
20104 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20106 tmp = gen_reg_rtx (QImode);
20107 ix86_expand_setcc (tmp, code, op0, op1);
20109 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20112 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20113 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20114 operands[2], operands[3])));
20116 return true;
20119 /* Expand a floating-point vector conditional move; a vcond operation
20120 rather than a movcc operation. */
20122 bool
20123 ix86_expand_fp_vcond (rtx operands[])
20125 enum rtx_code code = GET_CODE (operands[3]);
20126 rtx cmp;
20128 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20129 &operands[4], &operands[5]);
20130 if (code == UNKNOWN)
20132 rtx temp;
20133 switch (GET_CODE (operands[3]))
20135 case LTGT:
20136 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20137 operands[5], operands[0], operands[0]);
20138 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20139 operands[5], operands[1], operands[2]);
20140 code = AND;
20141 break;
20142 case UNEQ:
20143 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20144 operands[5], operands[0], operands[0]);
20145 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20146 operands[5], operands[1], operands[2]);
20147 code = IOR;
20148 break;
20149 default:
20150 gcc_unreachable ();
20152 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20153 OPTAB_DIRECT);
20154 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20155 return true;
20158 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20159 operands[5], operands[1], operands[2]))
20160 return true;
20162 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20163 operands[1], operands[2]);
20164 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20165 return true;
20168 /* Expand a signed/unsigned integral vector conditional move. */
20170 bool
20171 ix86_expand_int_vcond (rtx operands[])
20173 enum machine_mode data_mode = GET_MODE (operands[0]);
20174 enum machine_mode mode = GET_MODE (operands[4]);
20175 enum rtx_code code = GET_CODE (operands[3]);
20176 bool negate = false;
20177 rtx x, cop0, cop1;
20179 cop0 = operands[4];
20180 cop1 = operands[5];
20182 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20183 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20184 if ((code == LT || code == GE)
20185 && data_mode == mode
20186 && cop1 == CONST0_RTX (mode)
20187 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20188 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20189 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20190 && (GET_MODE_SIZE (data_mode) == 16
20191 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20193 rtx negop = operands[2 - (code == LT)];
20194 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20195 if (negop == CONST1_RTX (data_mode))
20197 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20198 operands[0], 1, OPTAB_DIRECT);
20199 if (res != operands[0])
20200 emit_move_insn (operands[0], res);
20201 return true;
20203 else if (GET_MODE_INNER (data_mode) != DImode
20204 && vector_all_ones_operand (negop, data_mode))
20206 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20207 operands[0], 0, OPTAB_DIRECT);
20208 if (res != operands[0])
20209 emit_move_insn (operands[0], res);
20210 return true;
20214 if (!nonimmediate_operand (cop1, mode))
20215 cop1 = force_reg (mode, cop1);
20216 if (!general_operand (operands[1], data_mode))
20217 operands[1] = force_reg (data_mode, operands[1]);
20218 if (!general_operand (operands[2], data_mode))
20219 operands[2] = force_reg (data_mode, operands[2]);
20221 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20222 if (TARGET_XOP
20223 && (mode == V16QImode || mode == V8HImode
20224 || mode == V4SImode || mode == V2DImode))
20226 else
20228 /* Canonicalize the comparison to EQ, GT, GTU. */
20229 switch (code)
20231 case EQ:
20232 case GT:
20233 case GTU:
20234 break;
20236 case NE:
20237 case LE:
20238 case LEU:
20239 code = reverse_condition (code);
20240 negate = true;
20241 break;
20243 case GE:
20244 case GEU:
20245 code = reverse_condition (code);
20246 negate = true;
20247 /* FALLTHRU */
20249 case LT:
20250 case LTU:
20251 code = swap_condition (code);
20252 x = cop0, cop0 = cop1, cop1 = x;
20253 break;
20255 default:
20256 gcc_unreachable ();
20259 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20260 if (mode == V2DImode)
20262 switch (code)
20264 case EQ:
20265 /* SSE4.1 supports EQ. */
20266 if (!TARGET_SSE4_1)
20267 return false;
20268 break;
20270 case GT:
20271 case GTU:
20272 /* SSE4.2 supports GT/GTU. */
20273 if (!TARGET_SSE4_2)
20274 return false;
20275 break;
20277 default:
20278 gcc_unreachable ();
20282 /* Unsigned parallel compare is not supported by the hardware.
20283 Play some tricks to turn this into a signed comparison
20284 against 0. */
20285 if (code == GTU)
20287 cop0 = force_reg (mode, cop0);
20289 switch (mode)
20291 case V8SImode:
20292 case V4DImode:
20293 case V4SImode:
20294 case V2DImode:
20296 rtx t1, t2, mask;
20297 rtx (*gen_sub3) (rtx, rtx, rtx);
20299 switch (mode)
20301 case V8SImode: gen_sub3 = gen_subv8si3; break;
20302 case V4DImode: gen_sub3 = gen_subv4di3; break;
20303 case V4SImode: gen_sub3 = gen_subv4si3; break;
20304 case V2DImode: gen_sub3 = gen_subv2di3; break;
20305 default:
20306 gcc_unreachable ();
20308 /* Subtract (-(INT MAX) - 1) from both operands to make
20309 them signed. */
20310 mask = ix86_build_signbit_mask (mode, true, false);
20311 t1 = gen_reg_rtx (mode);
20312 emit_insn (gen_sub3 (t1, cop0, mask));
20314 t2 = gen_reg_rtx (mode);
20315 emit_insn (gen_sub3 (t2, cop1, mask));
20317 cop0 = t1;
20318 cop1 = t2;
20319 code = GT;
20321 break;
20323 case V32QImode:
20324 case V16HImode:
20325 case V16QImode:
20326 case V8HImode:
20327 /* Perform a parallel unsigned saturating subtraction. */
20328 x = gen_reg_rtx (mode);
20329 emit_insn (gen_rtx_SET (VOIDmode, x,
20330 gen_rtx_US_MINUS (mode, cop0, cop1)));
20332 cop0 = x;
20333 cop1 = CONST0_RTX (mode);
20334 code = EQ;
20335 negate = !negate;
20336 break;
20338 default:
20339 gcc_unreachable ();
20344 /* Allow the comparison to be done in one mode, but the movcc to
20345 happen in another mode. */
20346 if (data_mode == mode)
20348 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20349 operands[1+negate], operands[2-negate]);
20351 else
20353 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20354 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20355 code, cop0, cop1,
20356 operands[1+negate], operands[2-negate]);
20357 x = gen_lowpart (data_mode, x);
20360 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20361 operands[2-negate]);
20362 return true;
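/* Illustrative sketch (not from the i386 sources; name invented, never
   compiled): the GTU handling above relies on the usual bias trick --
   subtracting the sign-bit constant from both operands maps unsigned
   order onto signed order, so a single signed GT suffices.  A 32-bit
   scalar model:  */
#if 0
static int
illustrate_unsigned_gt_via_bias (unsigned int a, unsigned int b)
{
  int sa = (int) (a - 0x80000000u);   /* Same wrap as the vector subtract.  */
  int sb = (int) (b - 0x80000000u);
  return sa > sb;                     /* Equals (a > b) on unsigned values.  */
}
#endif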
20365 /* Expand a variable vector permutation. */
20367 void
20368 ix86_expand_vec_perm (rtx operands[])
20370 rtx target = operands[0];
20371 rtx op0 = operands[1];
20372 rtx op1 = operands[2];
20373 rtx mask = operands[3];
20374 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20375 enum machine_mode mode = GET_MODE (op0);
20376 enum machine_mode maskmode = GET_MODE (mask);
20377 int w, e, i;
20378 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20380 /* Number of elements in the vector. */
20381 w = GET_MODE_NUNITS (mode);
20382 e = GET_MODE_UNIT_SIZE (mode);
20383 gcc_assert (w <= 32);
20385 if (TARGET_AVX2)
20387 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20389 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20390 a constant shuffle operand. With a tiny bit of effort we can
20391 use VPERMD instead. A re-interpretation stall for V4DFmode is
20392 unfortunate but there's no avoiding it.
20393 Similarly, for V16HImode we don't have instructions for variable
20394 shuffling, while for V32QImode we can, after preparing suitable
20395 masks, use vpshufb; vpshufb; vpermq; vpor. */
20397 if (mode == V16HImode)
20399 maskmode = mode = V32QImode;
20400 w = 32;
20401 e = 1;
20403 else
20405 maskmode = mode = V8SImode;
20406 w = 8;
20407 e = 4;
20409 t1 = gen_reg_rtx (maskmode);
20411 /* Replicate the low bits of the V4DImode mask into V8SImode:
20412 mask = { A B C D }
20413 t1 = { A A B B C C D D }. */
20414 for (i = 0; i < w / 2; ++i)
20415 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20416 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20417 vt = force_reg (maskmode, vt);
20418 mask = gen_lowpart (maskmode, mask);
20419 if (maskmode == V8SImode)
20420 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20421 else
20422 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20424 /* Multiply the shuffle indices by two. */
20425 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20426 OPTAB_DIRECT);
20428 /* Add one to the odd shuffle indices:
20429 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20430 for (i = 0; i < w / 2; ++i)
20432 vec[i * 2] = const0_rtx;
20433 vec[i * 2 + 1] = const1_rtx;
20435 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20436 vt = force_const_mem (maskmode, vt);
20437 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20438 OPTAB_DIRECT);
20440 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20441 operands[3] = mask = t1;
20442 target = gen_lowpart (mode, target);
20443 op0 = gen_lowpart (mode, op0);
20444 op1 = gen_lowpart (mode, op1);
20447 switch (mode)
20449 case V8SImode:
20450 /* The VPERMD and VPERMPS instructions already properly ignore
20451 the high bits of the shuffle elements. No need for us to
20452 perform an AND ourselves. */
20453 if (one_operand_shuffle)
20454 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20455 else
20457 t1 = gen_reg_rtx (V8SImode);
20458 t2 = gen_reg_rtx (V8SImode);
20459 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20460 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20461 goto merge_two;
20463 return;
20465 case V8SFmode:
20466 mask = gen_lowpart (V8SFmode, mask);
20467 if (one_operand_shuffle)
20468 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20469 else
20471 t1 = gen_reg_rtx (V8SFmode);
20472 t2 = gen_reg_rtx (V8SFmode);
20473 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20474 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20475 goto merge_two;
20477 return;
20479 case V4SImode:
20480 /* By combining the two 128-bit input vectors into one 256-bit
20481 input vector, we can use VPERMD and VPERMPS for the full
20482 two-operand shuffle. */
20483 t1 = gen_reg_rtx (V8SImode);
20484 t2 = gen_reg_rtx (V8SImode);
20485 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20486 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20487 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20488 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20489 return;
20491 case V4SFmode:
20492 t1 = gen_reg_rtx (V8SFmode);
20493 t2 = gen_reg_rtx (V8SImode);
20494 mask = gen_lowpart (V4SImode, mask);
20495 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20496 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20497 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20498 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20499 return;
20501 case V32QImode:
20502 t1 = gen_reg_rtx (V32QImode);
20503 t2 = gen_reg_rtx (V32QImode);
20504 t3 = gen_reg_rtx (V32QImode);
20505 vt2 = GEN_INT (128);
20506 for (i = 0; i < 32; i++)
20507 vec[i] = vt2;
20508 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20509 vt = force_reg (V32QImode, vt);
20510 for (i = 0; i < 32; i++)
20511 vec[i] = i < 16 ? vt2 : const0_rtx;
20512 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20513 vt2 = force_reg (V32QImode, vt2);
20514 /* From mask create two adjusted masks, which contain the same
20515 bits as mask in the low 7 bits of each vector element.
20516 The first mask will have the most significant bit clear
20517 if it requests element from the same 128-bit lane
20518 and MSB set if it requests element from the other 128-bit lane.
20519 The second mask will have the opposite values of the MSB,
20520 and additionally will have its 128-bit lanes swapped.
20521 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20522 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20523 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20524 stands for the other 12 bytes. */
20525 /* The bit that says whether an element comes from the same lane or the
20526 other lane is bit 4, so shift it up by 3 to the MSB position. */
20527 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20528 gen_lowpart (V4DImode, mask),
20529 GEN_INT (3)));
20530 /* Clear MSB bits from the mask just in case it had them set. */
20531 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20532 /* After this t1 will have the MSB set for elements from the other lane. */
20533 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20534 /* Clear bits other than MSB. */
20535 emit_insn (gen_andv32qi3 (t1, t1, vt));
20536 /* Or in the lower bits from mask into t3. */
20537 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20538 /* And invert MSB bits in t1, so MSB is set for elements from the same
20539 lane. */
20540 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20541 /* Swap 128-bit lanes in t3. */
20542 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20543 gen_lowpart (V4DImode, t3),
20544 const2_rtx, GEN_INT (3),
20545 const0_rtx, const1_rtx));
20546 /* And or in the lower bits from mask into t1. */
20547 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20548 if (one_operand_shuffle)
20550 /* Each of these shuffles will put 0s in places where an
20551 element from the other 128-bit lane is needed; otherwise
20552 it will shuffle in the requested value. */
20553 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20554 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20555 /* For t3 the 128-bit lanes are swapped again. */
20556 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20557 gen_lowpart (V4DImode, t3),
20558 const2_rtx, GEN_INT (3),
20559 const0_rtx, const1_rtx));
20560 /* ORing both together then yields the result. */
20561 emit_insn (gen_iorv32qi3 (target, t1, t3));
20562 return;
20565 t4 = gen_reg_rtx (V32QImode);
20566 /* Similar to the one_operand_shuffle code above,
20567 just repeated twice, once for each operand. The merge_two:
20568 code will merge the two results together. */
20569 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20570 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20571 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20572 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20573 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20574 gen_lowpart (V4DImode, t4),
20575 const2_rtx, GEN_INT (3),
20576 const0_rtx, const1_rtx));
20577 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20578 gen_lowpart (V4DImode, t3),
20579 const2_rtx, GEN_INT (3),
20580 const0_rtx, const1_rtx));
20581 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20582 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20583 t1 = t4;
20584 t2 = t3;
20585 goto merge_two;
20587 default:
20588 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20589 break;
20593 if (TARGET_XOP)
20595 /* The XOP VPPERM insn supports three inputs. By ignoring the
20596 one_operand_shuffle special case, we avoid creating another
20597 set of constant vectors in memory. */
20598 one_operand_shuffle = false;
20600 /* mask = mask & {2*w-1, ...} */
20601 vt = GEN_INT (2*w - 1);
20603 else
20605 /* mask = mask & {w-1, ...} */
20606 vt = GEN_INT (w - 1);
20609 for (i = 0; i < w; i++)
20610 vec[i] = vt;
20611 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20612 mask = expand_simple_binop (maskmode, AND, mask, vt,
20613 NULL_RTX, 0, OPTAB_DIRECT);
20615 /* For non-QImode operations, convert the word permutation control
20616 into a byte permutation control. */
20617 if (mode != V16QImode)
20619 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20620 GEN_INT (exact_log2 (e)),
20621 NULL_RTX, 0, OPTAB_DIRECT);
20623 /* Convert mask to vector of chars. */
20624 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20626 /* Replicate each of the input bytes into byte positions:
20627 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20628 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20629 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20630 for (i = 0; i < 16; ++i)
20631 vec[i] = GEN_INT (i/e * e);
20632 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20633 vt = force_const_mem (V16QImode, vt);
20634 if (TARGET_XOP)
20635 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20636 else
20637 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20639 /* Convert it into the byte positions by doing
20640 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20641 for (i = 0; i < 16; ++i)
20642 vec[i] = GEN_INT (i % e);
20643 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20644 vt = force_const_mem (V16QImode, vt);
20645 emit_insn (gen_addv16qi3 (mask, mask, vt));
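/* Worked example (illustration only, not from the i386 sources): for a
   V4SImode permute (w = 4, e = 4) a lane index of 2 becomes byte value 8
   after the shift by log2(e); the pshufb against {0,0,0,0, 4,4,4,4, ...}
   replicates it into all four byte slots of its lane, and the add of
   {0,1,2,3, 0,1,2,3, ...} turns that into the byte control {8,9,10,11}
   expected by the V16QImode shuffles below.  */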
20648 /* The actual shuffle operations all operate on V16QImode. */
20649 op0 = gen_lowpart (V16QImode, op0);
20650 op1 = gen_lowpart (V16QImode, op1);
20651 target = gen_lowpart (V16QImode, target);
20653 if (TARGET_XOP)
20655 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20657 else if (one_operand_shuffle)
20659 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20661 else
20663 rtx xops[6];
20664 bool ok;
20666 /* Shuffle the two input vectors independently. */
20667 t1 = gen_reg_rtx (V16QImode);
20668 t2 = gen_reg_rtx (V16QImode);
20669 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20670 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20672 merge_two:
20673 /* Then merge them together. The key is whether any given control
20674 element contained a bit set that indicates the second word. */
20675 mask = operands[3];
20676 vt = GEN_INT (w);
20677 if (maskmode == V2DImode && !TARGET_SSE4_1)
20679 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20680 more shuffle to convert the V2DI input mask into a V4SI
20681 input mask. At that point the masking that
20682 ix86_expand_int_vcond relies on will work as desired. */
20683 rtx t3 = gen_reg_rtx (V4SImode);
20684 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20685 const0_rtx, const0_rtx,
20686 const2_rtx, const2_rtx));
20687 mask = t3;
20688 maskmode = V4SImode;
20689 e = w = 4;
20692 for (i = 0; i < w; i++)
20693 vec[i] = vt;
20694 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20695 vt = force_reg (maskmode, vt);
20696 mask = expand_simple_binop (maskmode, AND, mask, vt,
20697 NULL_RTX, 0, OPTAB_DIRECT);
20699 xops[0] = gen_lowpart (mode, operands[0]);
20700 xops[1] = gen_lowpart (mode, t2);
20701 xops[2] = gen_lowpart (mode, t1);
20702 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20703 xops[4] = mask;
20704 xops[5] = vt;
20705 ok = ix86_expand_int_vcond (xops);
20706 gcc_assert (ok);
20710 /* Unpack SRC into DEST, which has the next wider integer vector type. UNSIGNED_P is
20711 true if we should do zero extension, else sign extension. HIGH_P is
20712 true if we want the N/2 high elements, else the low elements. */
20714 void
20715 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20717 enum machine_mode imode = GET_MODE (src);
20718 rtx tmp;
20720 if (TARGET_SSE4_1)
20722 rtx (*unpack)(rtx, rtx);
20723 rtx (*extract)(rtx, rtx) = NULL;
20724 enum machine_mode halfmode = BLKmode;
20726 switch (imode)
20728 case V32QImode:
20729 if (unsigned_p)
20730 unpack = gen_avx2_zero_extendv16qiv16hi2;
20731 else
20732 unpack = gen_avx2_sign_extendv16qiv16hi2;
20733 halfmode = V16QImode;
20734 extract
20735 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20736 break;
20737 case V16HImode:
20738 if (unsigned_p)
20739 unpack = gen_avx2_zero_extendv8hiv8si2;
20740 else
20741 unpack = gen_avx2_sign_extendv8hiv8si2;
20742 halfmode = V8HImode;
20743 extract
20744 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20745 break;
20746 case V8SImode:
20747 if (unsigned_p)
20748 unpack = gen_avx2_zero_extendv4siv4di2;
20749 else
20750 unpack = gen_avx2_sign_extendv4siv4di2;
20751 halfmode = V4SImode;
20752 extract
20753 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20754 break;
20755 case V16QImode:
20756 if (unsigned_p)
20757 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20758 else
20759 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20760 break;
20761 case V8HImode:
20762 if (unsigned_p)
20763 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20764 else
20765 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20766 break;
20767 case V4SImode:
20768 if (unsigned_p)
20769 unpack = gen_sse4_1_zero_extendv2siv2di2;
20770 else
20771 unpack = gen_sse4_1_sign_extendv2siv2di2;
20772 break;
20773 default:
20774 gcc_unreachable ();
20777 if (GET_MODE_SIZE (imode) == 32)
20779 tmp = gen_reg_rtx (halfmode);
20780 emit_insn (extract (tmp, src));
20782 else if (high_p)
20784 /* Shift higher 8 bytes to lower 8 bytes. */
20785 tmp = gen_reg_rtx (imode);
20786 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20787 gen_lowpart (V1TImode, src),
20788 GEN_INT (64)));
20790 else
20791 tmp = src;
20793 emit_insn (unpack (dest, tmp));
20795 else
20797 rtx (*unpack)(rtx, rtx, rtx);
20799 switch (imode)
20801 case V16QImode:
20802 if (high_p)
20803 unpack = gen_vec_interleave_highv16qi;
20804 else
20805 unpack = gen_vec_interleave_lowv16qi;
20806 break;
20807 case V8HImode:
20808 if (high_p)
20809 unpack = gen_vec_interleave_highv8hi;
20810 else
20811 unpack = gen_vec_interleave_lowv8hi;
20812 break;
20813 case V4SImode:
20814 if (high_p)
20815 unpack = gen_vec_interleave_highv4si;
20816 else
20817 unpack = gen_vec_interleave_lowv4si;
20818 break;
20819 default:
20820 gcc_unreachable ();
20823 if (unsigned_p)
20824 tmp = force_reg (imode, CONST0_RTX (imode));
20825 else
20826 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20827 src, pc_rtx, pc_rtx);
20829 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
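/* Illustrative sketch (not from the i386 sources; name invented, never
   compiled; assumes 16-bit short and 32-bit int): in the signed
   pre-SSE4.1 path above, the interleave partner is the all-ones-if-
   negative mask produced by the GT compare, which is precisely two's
   complement sign extension.  */
#if 0
static int
illustrate_widen_via_interleave (short lo)
{
  unsigned short hi = (0 > lo) ? 0xffffu : 0u;            /* The GT-0 mask lane.  */
  unsigned int w = (unsigned short) lo | ((unsigned int) hi << 16);
  return (int) w;                                         /* Equals (int) lo.  */
}
#endif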
20833 /* Expand conditional increment or decrement using adc/sbb instructions.
20834 The default case using setcc followed by the conditional move can be
20835 done by generic code. */
20836 bool
20837 ix86_expand_int_addcc (rtx operands[])
20839 enum rtx_code code = GET_CODE (operands[1]);
20840 rtx flags;
20841 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20842 rtx compare_op;
20843 rtx val = const0_rtx;
20844 bool fpcmp = false;
20845 enum machine_mode mode;
20846 rtx op0 = XEXP (operands[1], 0);
20847 rtx op1 = XEXP (operands[1], 1);
20849 if (operands[3] != const1_rtx
20850 && operands[3] != constm1_rtx)
20851 return false;
20852 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20853 return false;
20854 code = GET_CODE (compare_op);
20856 flags = XEXP (compare_op, 0);
20858 if (GET_MODE (flags) == CCFPmode
20859 || GET_MODE (flags) == CCFPUmode)
20861 fpcmp = true;
20862 code = ix86_fp_compare_code_to_integer (code);
20865 if (code != LTU)
20867 val = constm1_rtx;
20868 if (fpcmp)
20869 PUT_CODE (compare_op,
20870 reverse_condition_maybe_unordered
20871 (GET_CODE (compare_op)));
20872 else
20873 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20876 mode = GET_MODE (operands[0]);
20878 /* Construct either adc or sbb insn. */
20879 if ((code == LTU) == (operands[3] == constm1_rtx))
20881 switch (mode)
20883 case QImode:
20884 insn = gen_subqi3_carry;
20885 break;
20886 case HImode:
20887 insn = gen_subhi3_carry;
20888 break;
20889 case SImode:
20890 insn = gen_subsi3_carry;
20891 break;
20892 case DImode:
20893 insn = gen_subdi3_carry;
20894 break;
20895 default:
20896 gcc_unreachable ();
20899 else
20901 switch (mode)
20903 case QImode:
20904 insn = gen_addqi3_carry;
20905 break;
20906 case HImode:
20907 insn = gen_addhi3_carry;
20908 break;
20909 case SImode:
20910 insn = gen_addsi3_carry;
20911 break;
20912 case DImode:
20913 insn = gen_adddi3_carry;
20914 break;
20915 default:
20916 gcc_unreachable ();
20919 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20921 return true;
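/* Illustration only (not from the i386 sources; registers illustrative):
   with operands[3] == const1_rtx and an unsigned "a < b" condition the
   insn constructed above boils down to

       cmpl  %esi, %edi      # CF = (a <u b)
       adcl  $0, %eax        # x += (a < b)

   while the constm1_rtx case uses the matching sbbl $0 to subtract the
   carry instead.  */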
20925 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20926 but works for floating point parameters and non-offsettable memories.
20927 For pushes, it returns just stack offsets; the values will be saved
20928 in the right order. At most four parts are generated. */
20930 static int
20931 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20933 int size;
20935 if (!TARGET_64BIT)
20936 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20937 else
20938 size = (GET_MODE_SIZE (mode) + 4) / 8;
20940 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20941 gcc_assert (size >= 2 && size <= 4);
20943 /* Optimize constant pool references to immediates. This is used by fp
20944 moves, which force all constants to memory to allow combining. */
20945 if (MEM_P (operand) && MEM_READONLY_P (operand))
20947 rtx tmp = maybe_get_pool_constant (operand);
20948 if (tmp)
20949 operand = tmp;
20952 if (MEM_P (operand) && !offsettable_memref_p (operand))
20954 /* The only non-offsettable memories we handle are pushes. */
20955 int ok = push_operand (operand, VOIDmode);
20957 gcc_assert (ok);
20959 operand = copy_rtx (operand);
20960 PUT_MODE (operand, word_mode);
20961 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20962 return size;
20965 if (GET_CODE (operand) == CONST_VECTOR)
20967 enum machine_mode imode = int_mode_for_mode (mode);
20968 /* Caution: if we looked through a constant pool memory above,
20969 the operand may actually have a different mode now. That's
20970 ok, since we want to pun this all the way back to an integer. */
20971 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20972 gcc_assert (operand != NULL);
20973 mode = imode;
20976 if (!TARGET_64BIT)
20978 if (mode == DImode)
20979 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20980 else
20982 int i;
20984 if (REG_P (operand))
20986 gcc_assert (reload_completed);
20987 for (i = 0; i < size; i++)
20988 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20990 else if (offsettable_memref_p (operand))
20992 operand = adjust_address (operand, SImode, 0);
20993 parts[0] = operand;
20994 for (i = 1; i < size; i++)
20995 parts[i] = adjust_address (operand, SImode, 4 * i);
20997 else if (GET_CODE (operand) == CONST_DOUBLE)
20999 REAL_VALUE_TYPE r;
21000 long l[4];
21002 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21003 switch (mode)
21005 case TFmode:
21006 real_to_target (l, &r, mode);
21007 parts[3] = gen_int_mode (l[3], SImode);
21008 parts[2] = gen_int_mode (l[2], SImode);
21009 break;
21010 case XFmode:
21011 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21012 long double may not be 80-bit. */
21013 real_to_target (l, &r, mode);
21014 parts[2] = gen_int_mode (l[2], SImode);
21015 break;
21016 case DFmode:
21017 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21018 break;
21019 default:
21020 gcc_unreachable ();
21022 parts[1] = gen_int_mode (l[1], SImode);
21023 parts[0] = gen_int_mode (l[0], SImode);
21025 else
21026 gcc_unreachable ();
21029 else
21031 if (mode == TImode)
21032 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21033 if (mode == XFmode || mode == TFmode)
21035 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21036 if (REG_P (operand))
21038 gcc_assert (reload_completed);
21039 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21040 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21042 else if (offsettable_memref_p (operand))
21044 operand = adjust_address (operand, DImode, 0);
21045 parts[0] = operand;
21046 parts[1] = adjust_address (operand, upper_mode, 8);
21048 else if (GET_CODE (operand) == CONST_DOUBLE)
21050 REAL_VALUE_TYPE r;
21051 long l[4];
21053 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21054 real_to_target (l, &r, mode);
21056 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21057 if (HOST_BITS_PER_WIDE_INT >= 64)
21058 parts[0]
21059 = gen_int_mode
21060 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21061 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21062 DImode);
21063 else
21064 parts[0] = immed_double_const (l[0], l[1], DImode);
21066 if (upper_mode == SImode)
21067 parts[1] = gen_int_mode (l[2], SImode);
21068 else if (HOST_BITS_PER_WIDE_INT >= 64)
21069 parts[1]
21070 = gen_int_mode
21071 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21072 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21073 DImode);
21074 else
21075 parts[1] = immed_double_const (l[2], l[3], DImode);
21077 else
21078 gcc_unreachable ();
21082 return size;
21085 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21086 The operands array is used as scratch storage: slots 2 onwards receive
21087 the destination parts and slots 6 onwards the source parts, in the
21088 order in which the part-wise moves are finally emitted. */
21090 void
21091 ix86_split_long_move (rtx operands[])
21093 rtx part[2][4];
21094 int nparts, i, j;
21095 int push = 0;
21096 int collisions = 0;
21097 enum machine_mode mode = GET_MODE (operands[0]);
21098 bool collisionparts[4];
21100 /* The DFmode expanders may ask us to move a double.
21101 For a 64-bit target this is a single move. By hiding that fact
21102 here we simplify the i386.md splitters. */
21103 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21105 /* Optimize constant pool references to immediates. This is used by
21106 fp moves, which force all constants to memory to allow combining. */
21108 if (MEM_P (operands[1])
21109 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21110 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21111 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21112 if (push_operand (operands[0], VOIDmode))
21114 operands[0] = copy_rtx (operands[0]);
21115 PUT_MODE (operands[0], word_mode);
21117 else
21118 operands[0] = gen_lowpart (DImode, operands[0]);
21119 operands[1] = gen_lowpart (DImode, operands[1]);
21120 emit_move_insn (operands[0], operands[1]);
21121 return;
21124 /* The only non-offsettable memory we handle is push. */
21125 if (push_operand (operands[0], VOIDmode))
21126 push = 1;
21127 else
21128 gcc_assert (!MEM_P (operands[0])
21129 || offsettable_memref_p (operands[0]));
21131 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21132 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21134 /* When emitting a push, take care of source operands that live on the stack. */
21135 if (push && MEM_P (operands[1])
21136 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21138 rtx src_base = XEXP (part[1][nparts - 1], 0);
21140 /* Compensate for the stack decrement by 4. */
21141 if (!TARGET_64BIT && nparts == 3
21142 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21143 src_base = plus_constant (Pmode, src_base, 4);
21145 /* src_base refers to the stack pointer and is
21146 automatically decreased by each emitted push. */
21147 for (i = 0; i < nparts; i++)
21148 part[1][i] = change_address (part[1][i],
21149 GET_MODE (part[1][i]), src_base);
21152 /* We need to do the copy in the right order in case an address register
21153 of the source overlaps the destination. */
21154 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21156 rtx tmp;
21158 for (i = 0; i < nparts; i++)
21160 collisionparts[i]
21161 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21162 if (collisionparts[i])
21163 collisions++;
21166 /* Collision in the middle part can be handled by reordering. */
21167 if (collisions == 1 && nparts == 3 && collisionparts [1])
21169 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21170 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21172 else if (collisions == 1
21173 && nparts == 4
21174 && (collisionparts [1] || collisionparts [2]))
21176 if (collisionparts [1])
21178 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21179 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21181 else
21183 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21184 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21188 /* If there are more collisions, we can't handle it by reordering.
21189 Do an lea to the last part and use only one colliding move. */
21190 else if (collisions > 1)
21192 rtx base;
21194 collisions = 1;
21196 base = part[0][nparts - 1];
21198 /* Handle the case when the last part isn't valid for lea.
21199 Happens in 64-bit mode storing the 12-byte XFmode. */
21200 if (GET_MODE (base) != Pmode)
21201 base = gen_rtx_REG (Pmode, REGNO (base));
21203 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21204 part[1][0] = replace_equiv_address (part[1][0], base);
21205 for (i = 1; i < nparts; i++)
21207 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21208 part[1][i] = replace_equiv_address (part[1][i], tmp);
21213 if (push)
21215 if (!TARGET_64BIT)
21217 if (nparts == 3)
21219 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21220 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21221 stack_pointer_rtx, GEN_INT (-4)));
21222 emit_move_insn (part[0][2], part[1][2]);
21224 else if (nparts == 4)
21226 emit_move_insn (part[0][3], part[1][3]);
21227 emit_move_insn (part[0][2], part[1][2]);
21230 else
21232 /* In 64-bit mode we don't have a 32-bit push available. If the operand is
21233 a register, that is fine - we just use the larger counterpart. We also
21234 retype memory - this comes from an attempt to avoid a REX prefix when
21235 moving the second half of a TFmode value. */
21236 if (GET_MODE (part[1][1]) == SImode)
21238 switch (GET_CODE (part[1][1]))
21240 case MEM:
21241 part[1][1] = adjust_address (part[1][1], DImode, 0);
21242 break;
21244 case REG:
21245 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21246 break;
21248 default:
21249 gcc_unreachable ();
21252 if (GET_MODE (part[1][0]) == SImode)
21253 part[1][0] = part[1][1];
21256 emit_move_insn (part[0][1], part[1][1]);
21257 emit_move_insn (part[0][0], part[1][0]);
21258 return;
21261 /* Choose correct order to not overwrite the source before it is copied. */
21262 if ((REG_P (part[0][0])
21263 && REG_P (part[1][1])
21264 && (REGNO (part[0][0]) == REGNO (part[1][1])
21265 || (nparts == 3
21266 && REGNO (part[0][0]) == REGNO (part[1][2]))
21267 || (nparts == 4
21268 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21269 || (collisions > 0
21270 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21272 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21274 operands[2 + i] = part[0][j];
21275 operands[6 + i] = part[1][j];
21278 else
21280 for (i = 0; i < nparts; i++)
21282 operands[2 + i] = part[0][i];
21283 operands[6 + i] = part[1][i];
21287 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21288 if (optimize_insn_for_size_p ())
21290 for (j = 0; j < nparts - 1; j++)
21291 if (CONST_INT_P (operands[6 + j])
21292 && operands[6 + j] != const0_rtx
21293 && REG_P (operands[2 + j]))
21294 for (i = j; i < nparts - 1; i++)
21295 if (CONST_INT_P (operands[7 + i])
21296 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21297 operands[7 + i] = operands[2 + j];
21300 for (i = 0; i < nparts; i++)
21301 emit_move_insn (operands[2 + i], operands[6 + i]);
21303 return;
21306 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21307 left shift by a constant, either using a single shift or
21308 a sequence of add instructions. */
21310 static void
21311 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21313 rtx (*insn)(rtx, rtx, rtx);
21315 if (count == 1
21316 || (count * ix86_cost->add <= ix86_cost->shift_const
21317 && !optimize_insn_for_size_p ()))
21319 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21320 while (count-- > 0)
21321 emit_insn (insn (operand, operand, operand));
21323 else
21325 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21326 emit_insn (insn (operand, operand, GEN_INT (count)));
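/* Editorial sketch, not part of the original file: the cost test in
   ix86_expand_ashl_const above chooses between COUNT self-additions (each
   doubling the operand, i.e. shifting it left by one) and a single
   shift-by-constant.  A plain-C model of that choice, with invented cost
   parameters for illustration only:  */
static unsigned long long
ashl_by_adds_or_shift_sketch (unsigned long long x, int count,
			      int add_cost, int shift_const_cost)
{
  if (count == 1 || count * add_cost <= shift_const_cost)
    {
      while (count-- > 0)
	x = x + x;		/* one addition doubles the value */
      return x;
    }
  return x << count;		/* otherwise a single shift is cheaper */
}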
21330 void
21331 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21333 rtx (*gen_ashl3)(rtx, rtx, rtx);
21334 rtx (*gen_shld)(rtx, rtx, rtx);
21335 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21337 rtx low[2], high[2];
21338 int count;
21340 if (CONST_INT_P (operands[2]))
21342 split_double_mode (mode, operands, 2, low, high);
21343 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21345 if (count >= half_width)
21347 emit_move_insn (high[0], low[1]);
21348 emit_move_insn (low[0], const0_rtx);
21350 if (count > half_width)
21351 ix86_expand_ashl_const (high[0], count - half_width, mode);
21353 else
21355 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21357 if (!rtx_equal_p (operands[0], operands[1]))
21358 emit_move_insn (operands[0], operands[1]);
21360 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21361 ix86_expand_ashl_const (low[0], count, mode);
21363 return;
21366 split_double_mode (mode, operands, 1, low, high);
21368 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21370 if (operands[1] == const1_rtx)
21372 /* Assuming we've chosen QImode-capable registers, 1 << N
21373 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21374 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21376 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21378 ix86_expand_clear (low[0]);
21379 ix86_expand_clear (high[0]);
21380 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21382 d = gen_lowpart (QImode, low[0]);
21383 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21384 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21385 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21387 d = gen_lowpart (QImode, high[0]);
21388 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21389 s = gen_rtx_NE (QImode, flags, const0_rtx);
21390 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21393 /* Otherwise, we can get the same results by manually performing
21394 a bit extract operation on bit 5/6, and then performing the two
21395 shifts. The two methods of getting 0/1 into low/high are exactly
21396 the same size. Avoiding the shift in the bit extract case helps
21397 pentium4 a bit; no one else seems to care much either way. */
21398 else
21400 enum machine_mode half_mode;
21401 rtx (*gen_lshr3)(rtx, rtx, rtx);
21402 rtx (*gen_and3)(rtx, rtx, rtx);
21403 rtx (*gen_xor3)(rtx, rtx, rtx);
21404 HOST_WIDE_INT bits;
21405 rtx x;
21407 if (mode == DImode)
21409 half_mode = SImode;
21410 gen_lshr3 = gen_lshrsi3;
21411 gen_and3 = gen_andsi3;
21412 gen_xor3 = gen_xorsi3;
21413 bits = 5;
21415 else
21417 half_mode = DImode;
21418 gen_lshr3 = gen_lshrdi3;
21419 gen_and3 = gen_anddi3;
21420 gen_xor3 = gen_xordi3;
21421 bits = 6;
21424 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21425 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21426 else
21427 x = gen_lowpart (half_mode, operands[2]);
21428 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21430 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21431 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21432 emit_move_insn (low[0], high[0]);
21433 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21436 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21437 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21438 return;
21441 if (operands[1] == constm1_rtx)
21443 /* For -1 << N, we can avoid the shld instruction, because we
21444 know that we're shifting 0...31/63 ones into a -1. */
21445 emit_move_insn (low[0], constm1_rtx);
21446 if (optimize_insn_for_size_p ())
21447 emit_move_insn (high[0], low[0]);
21448 else
21449 emit_move_insn (high[0], constm1_rtx);
21451 else
21453 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21455 if (!rtx_equal_p (operands[0], operands[1]))
21456 emit_move_insn (operands[0], operands[1]);
21458 split_double_mode (mode, operands, 1, low, high);
21459 emit_insn (gen_shld (high[0], low[0], operands[2]));
21462 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21464 if (TARGET_CMOVE && scratch)
21466 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21467 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21469 ix86_expand_clear (scratch);
21470 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21472 else
21474 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21475 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21477 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21481 void
21482 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21484 rtx (*gen_ashr3)(rtx, rtx, rtx)
21485 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21486 rtx (*gen_shrd)(rtx, rtx, rtx);
21487 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21489 rtx low[2], high[2];
21490 int count;
21492 if (CONST_INT_P (operands[2]))
21494 split_double_mode (mode, operands, 2, low, high);
21495 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21497 if (count == GET_MODE_BITSIZE (mode) - 1)
21499 emit_move_insn (high[0], high[1]);
21500 emit_insn (gen_ashr3 (high[0], high[0],
21501 GEN_INT (half_width - 1)));
21502 emit_move_insn (low[0], high[0]);
21505 else if (count >= half_width)
21507 emit_move_insn (low[0], high[1]);
21508 emit_move_insn (high[0], low[0]);
21509 emit_insn (gen_ashr3 (high[0], high[0],
21510 GEN_INT (half_width - 1)));
21512 if (count > half_width)
21513 emit_insn (gen_ashr3 (low[0], low[0],
21514 GEN_INT (count - half_width)));
21516 else
21518 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21520 if (!rtx_equal_p (operands[0], operands[1]))
21521 emit_move_insn (operands[0], operands[1]);
21523 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21524 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21527 else
21529 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21531 if (!rtx_equal_p (operands[0], operands[1]))
21532 emit_move_insn (operands[0], operands[1]);
21534 split_double_mode (mode, operands, 1, low, high);
21536 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21537 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21539 if (TARGET_CMOVE && scratch)
21541 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21542 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21544 emit_move_insn (scratch, high[0]);
21545 emit_insn (gen_ashr3 (scratch, scratch,
21546 GEN_INT (half_width - 1)));
21547 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21548 scratch));
21550 else
21552 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21553 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21555 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
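/* Editorial sketch, not part of the original file: the constant-count cases
   of ix86_split_ashr above, written as plain C for a DImode value held in
   two SImode halves on a 32-bit target.  Assumes 0 < count < 64 and that
   ">>" on a signed int is an arithmetic shift, as it is for GCC targets.  */
static void
ashr_split_sketch (int *lo, int *hi, int count)
{
  if (count >= 32)
    {
      *lo = *hi >> (count - 32);	/* low half receives the shifted high half */
      *hi = *hi >> 31;			/* high half becomes the sign mask */
    }
  else
    {
      /* shrd: the low half takes bits from both halves, the high half is
	 shifted arithmetically.  */
      *lo = (int) (((unsigned int) *lo >> count)
		   | ((unsigned int) *hi << (32 - count)));
      *hi = *hi >> count;
    }
}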
21560 void
21561 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21563 rtx (*gen_lshr3)(rtx, rtx, rtx)
21564 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21565 rtx (*gen_shrd)(rtx, rtx, rtx);
21566 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21568 rtx low[2], high[2];
21569 int count;
21571 if (CONST_INT_P (operands[2]))
21573 split_double_mode (mode, operands, 2, low, high);
21574 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21576 if (count >= half_width)
21578 emit_move_insn (low[0], high[1]);
21579 ix86_expand_clear (high[0]);
21581 if (count > half_width)
21582 emit_insn (gen_lshr3 (low[0], low[0],
21583 GEN_INT (count - half_width)));
21585 else
21587 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21589 if (!rtx_equal_p (operands[0], operands[1]))
21590 emit_move_insn (operands[0], operands[1]);
21592 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21593 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21596 else
21598 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21600 if (!rtx_equal_p (operands[0], operands[1]))
21601 emit_move_insn (operands[0], operands[1]);
21603 split_double_mode (mode, operands, 1, low, high);
21605 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21606 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21608 if (TARGET_CMOVE && scratch)
21610 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21611 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21613 ix86_expand_clear (scratch);
21614 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21615 scratch));
21617 else
21619 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21620 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21622 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21627 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21628 static void
21629 predict_jump (int prob)
21631 rtx insn = get_last_insn ();
21632 gcc_assert (JUMP_P (insn));
21633 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
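/* Editorial note, not part of the original file: PROB is a fixed-point
   fraction of REG_BR_PROB_BASE (10000 in this tree), so a call such as
   predict_jump (REG_BR_PROB_BASE * 90 / 100) marks the branch as taken 90%
   of the time.  Minimal sketch of the encoding; the helper name is made up:  */
static int
percent_to_br_prob_sketch (int percent)
{
  const int reg_br_prob_base = 10000;		/* mirrors REG_BR_PROB_BASE */
  return reg_br_prob_base * percent / 100;	/* 90 -> 9000 */
}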
21636 /* Helper function for the string operations below. Test whether the VALUE
21637 bit of VARIABLE is set; the emitted code jumps to the returned label when it is not. */
21638 static rtx
21639 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21641 rtx label = gen_label_rtx ();
21642 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21643 if (GET_MODE (variable) == DImode)
21644 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21645 else
21646 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21647 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21648 1, label);
21649 if (epilogue)
21650 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21651 else
21652 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21653 return label;
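/* Editorial note, not part of the original file: a caller-side view of the
   helper above, taken from the epilogue expanders below:

     rtx label = ix86_expand_aligntest (count, 4, true);
     ... emit one 4-byte move, executed only when (count & 4) != 0 ...
     emit_label (label);

   i.e. the emitted test jumps to the returned label, skipping the move,
   when the tested bit of VARIABLE is clear.  */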
21656 /* Decrease COUNTREG by VALUE. */
21657 static void
21658 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21660 rtx (*gen_add)(rtx, rtx, rtx)
21661 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21663 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21666 /* Zero extend a possibly-SImode EXP to a Pmode register. */
21667 rtx
21668 ix86_zero_extend_to_Pmode (rtx exp)
21670 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21673 /* Divide COUNTREG by SCALE. */
21674 static rtx
21675 scale_counter (rtx countreg, int scale)
21677 rtx sc;
21679 if (scale == 1)
21680 return countreg;
21681 if (CONST_INT_P (countreg))
21682 return GEN_INT (INTVAL (countreg) / scale);
21683 gcc_assert (REG_P (countreg));
21685 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21686 GEN_INT (exact_log2 (scale)),
21687 NULL, 1, OPTAB_DIRECT);
21688 return sc;
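/* Editorial sketch, not part of the original file: scale_counter divides the
   byte count by the chunk size; for a run-time count this is done with a
   logical shift, which is only valid because SCALE is a power of two.
   Plain-C equivalent, with an invented name:  */
static long
scale_counter_sketch (long count, int scale)
{
  int log2 = 0;
  if (scale == 1)
    return count;
  while ((1 << (log2 + 1)) <= scale)
    log2++;			/* exact_log2 (scale) */
  /* e.g. count = 37, scale = 4: 37 >> 2 = 9 SImode chunks; the remaining
     37 & 3 = 1 byte is left for the epilogue code.  */
  return count >> log2;
}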
21691 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21692 DImode for constant loop counts. */
21694 static enum machine_mode
21695 counter_mode (rtx count_exp)
21697 if (GET_MODE (count_exp) != VOIDmode)
21698 return GET_MODE (count_exp);
21699 if (!CONST_INT_P (count_exp))
21700 return Pmode;
21701 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21702 return DImode;
21703 return SImode;
21706 /* When SRCPTR is non-NULL, output a simple loop to move memory
21707 pointed to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times;
21708 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21709 an equivalent loop that sets memory to VALUE (expected to be in MODE).
21711 The size is rounded down to a whole number of chunks moved at once.
21712 SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info. */
21715 static void
21716 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21717 rtx destptr, rtx srcptr, rtx value,
21718 rtx count, enum machine_mode mode, int unroll,
21719 int expected_size)
21721 rtx out_label, top_label, iter, tmp;
21722 enum machine_mode iter_mode = counter_mode (count);
21723 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21724 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21725 rtx size;
21726 rtx x_addr;
21727 rtx y_addr;
21728 int i;
21730 top_label = gen_label_rtx ();
21731 out_label = gen_label_rtx ();
21732 iter = gen_reg_rtx (iter_mode);
21734 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21735 NULL, 1, OPTAB_DIRECT);
21736 /* Those two should combine. */
21737 if (piece_size == const1_rtx)
21739 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21740 true, out_label);
21741 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21743 emit_move_insn (iter, const0_rtx);
21745 emit_label (top_label);
21747 tmp = convert_modes (Pmode, iter_mode, iter, true);
21748 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21749 destmem = change_address (destmem, mode, x_addr);
21751 if (srcmem)
21753 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21754 srcmem = change_address (srcmem, mode, y_addr);
21756 /* When unrolling for chips that reorder memory reads and writes,
21757 we can save registers by using a single temporary.
21758 Also, using 4 temporaries is overkill in 32-bit mode. */
21759 if (!TARGET_64BIT && 0)
21761 for (i = 0; i < unroll; i++)
21763 if (i)
21765 destmem =
21766 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21767 srcmem =
21768 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21770 emit_move_insn (destmem, srcmem);
21773 else
21775 rtx tmpreg[4];
21776 gcc_assert (unroll <= 4);
21777 for (i = 0; i < unroll; i++)
21779 tmpreg[i] = gen_reg_rtx (mode);
21780 if (i)
21782 srcmem =
21783 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21785 emit_move_insn (tmpreg[i], srcmem);
21787 for (i = 0; i < unroll; i++)
21789 if (i)
21791 destmem =
21792 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21794 emit_move_insn (destmem, tmpreg[i]);
21798 else
21799 for (i = 0; i < unroll; i++)
21801 if (i)
21802 destmem =
21803 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21804 emit_move_insn (destmem, value);
21807 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21808 true, OPTAB_LIB_WIDEN);
21809 if (tmp != iter)
21810 emit_move_insn (iter, tmp);
21812 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21813 true, top_label);
21814 if (expected_size != -1)
21816 expected_size /= GET_MODE_SIZE (mode) * unroll;
21817 if (expected_size == 0)
21818 predict_jump (0);
21819 else if (expected_size > REG_BR_PROB_BASE)
21820 predict_jump (REG_BR_PROB_BASE - 1);
21821 else
21822 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21824 else
21825 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21826 iter = ix86_zero_extend_to_Pmode (iter);
21827 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21828 true, OPTAB_LIB_WIDEN);
21829 if (tmp != destptr)
21830 emit_move_insn (destptr, tmp);
21831 if (srcptr)
21833 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21834 true, OPTAB_LIB_WIDEN);
21835 if (tmp != srcptr)
21836 emit_move_insn (srcptr, tmp);
21838 emit_label (out_label);
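/* Editorial sketch, not part of the original file: the RTL emitted by
   expand_set_or_movmem_via_loop corresponds roughly to the following C for
   the memset case with UNROLL == 1 (the memcpy case reads through a
   temporary register instead).  Names are invented for illustration:  */
static void
set_via_loop_sketch (unsigned char *dest, unsigned char value,
		     unsigned long count, unsigned long chunk)
{
  unsigned long size = count & ~(chunk - 1);	/* round down to whole chunks */
  unsigned long iter = 0;
  while (iter < size)
    {
      unsigned long i;
      for (i = 0; i < chunk; i++)	/* stands in for one MODE-sized store */
	dest[iter + i] = value;
      iter += chunk;
    }
  /* DEST is then advanced by ITER; the count & (chunk - 1) tail bytes are
     left to the epilogue code.  */
}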
21841 /* Output a "rep; mov" instruction.
21842 Arguments have the same meaning as for the previous function. */
21843 static void
21844 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21845 rtx destptr, rtx srcptr,
21846 rtx count,
21847 enum machine_mode mode)
21849 rtx destexp;
21850 rtx srcexp;
21851 rtx countreg;
21852 HOST_WIDE_INT rounded_count;
21854 /* If the size is known, it is shorter to use rep movs. */
21855 if (mode == QImode && CONST_INT_P (count)
21856 && !(INTVAL (count) & 3))
21857 mode = SImode;
21859 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21860 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21861 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21862 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21863 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21864 if (mode != QImode)
21866 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21867 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21868 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21869 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21870 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21871 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21873 else
21875 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21876 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21878 if (CONST_INT_P (count))
21880 rounded_count = (INTVAL (count)
21881 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21882 destmem = shallow_copy_rtx (destmem);
21883 srcmem = shallow_copy_rtx (srcmem);
21884 set_mem_size (destmem, rounded_count);
21885 set_mem_size (srcmem, rounded_count);
21887 else
21889 if (MEM_SIZE_KNOWN_P (destmem))
21890 clear_mem_size (destmem);
21891 if (MEM_SIZE_KNOWN_P (srcmem))
21892 clear_mem_size (srcmem);
21894 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21895 destexp, srcexp));
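/* Editorial sketch, not part of the original file: the rep_mov pattern
   emitted above behaves like the following C loop, with MODE selecting the
   element width (1, 4 or 8 bytes) and COUNTREG the element count.  The
   pointer updates mirror the destexp/srcexp expressions recorded in the
   insn.  Names are invented for illustration:  */
static void
rep_mov_sketch (unsigned char **dest, const unsigned char **src,
		unsigned long countreg, int elem_size)
{
  while (countreg--)
    {
      __builtin_memcpy (*dest, *src, elem_size);	/* one element move */
      *dest += elem_size;
      *src += elem_size;
    }
}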
21898 /* Output a "rep; stos" instruction.
21899 Arguments have the same meaning as for the previous function. */
21900 static void
21901 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21902 rtx count, enum machine_mode mode,
21903 rtx orig_value)
21905 rtx destexp;
21906 rtx countreg;
21907 HOST_WIDE_INT rounded_count;
21909 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21910 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21911 value = force_reg (mode, gen_lowpart (mode, value));
21912 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21913 if (mode != QImode)
21915 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21916 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21917 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21919 else
21920 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21921 if (orig_value == const0_rtx && CONST_INT_P (count))
21923 rounded_count = (INTVAL (count)
21924 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21925 destmem = shallow_copy_rtx (destmem);
21926 set_mem_size (destmem, rounded_count);
21928 else if (MEM_SIZE_KNOWN_P (destmem))
21929 clear_mem_size (destmem);
21930 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21933 static void
21934 emit_strmov (rtx destmem, rtx srcmem,
21935 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21937 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21938 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21939 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21942 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21943 static void
21944 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21945 rtx destptr, rtx srcptr, rtx count, int max_size)
21947 rtx src, dest;
21948 if (CONST_INT_P (count))
21950 HOST_WIDE_INT countval = INTVAL (count);
21951 int offset = 0;
21953 if ((countval & 0x10) && max_size > 16)
21955 if (TARGET_64BIT)
21957 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21958 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21960 else
21961 gcc_unreachable ();
21962 offset += 16;
21964 if ((countval & 0x08) && max_size > 8)
21966 if (TARGET_64BIT)
21967 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21968 else
21970 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21971 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21973 offset += 8;
21975 if ((countval & 0x04) && max_size > 4)
21977 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21978 offset += 4;
21980 if ((countval & 0x02) && max_size > 2)
21982 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21983 offset += 2;
21985 if ((countval & 0x01) && max_size > 1)
21987 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21988 offset += 1;
21990 return;
21992 if (max_size > 8)
21994 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21995 count, 1, OPTAB_DIRECT);
21996 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21997 count, QImode, 1, 4);
21998 return;
22001 /* When there are stringops, we can cheaply increase dest and src pointers.
22002 Otherwise we save code size by maintaining offset (zero is readily
22003 available from preceding rep operation) and using x86 addressing modes. */
22005 if (TARGET_SINGLE_STRINGOP)
22007 if (max_size > 4)
22009 rtx label = ix86_expand_aligntest (count, 4, true);
22010 src = change_address (srcmem, SImode, srcptr);
22011 dest = change_address (destmem, SImode, destptr);
22012 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22013 emit_label (label);
22014 LABEL_NUSES (label) = 1;
22016 if (max_size > 2)
22018 rtx label = ix86_expand_aligntest (count, 2, true);
22019 src = change_address (srcmem, HImode, srcptr);
22020 dest = change_address (destmem, HImode, destptr);
22021 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22022 emit_label (label);
22023 LABEL_NUSES (label) = 1;
22025 if (max_size > 1)
22027 rtx label = ix86_expand_aligntest (count, 1, true);
22028 src = change_address (srcmem, QImode, srcptr);
22029 dest = change_address (destmem, QImode, destptr);
22030 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22031 emit_label (label);
22032 LABEL_NUSES (label) = 1;
22035 else
22037 rtx offset = force_reg (Pmode, const0_rtx);
22038 rtx tmp;
22040 if (max_size > 4)
22042 rtx label = ix86_expand_aligntest (count, 4, true);
22043 src = change_address (srcmem, SImode, srcptr);
22044 dest = change_address (destmem, SImode, destptr);
22045 emit_move_insn (dest, src);
22046 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22047 true, OPTAB_LIB_WIDEN);
22048 if (tmp != offset)
22049 emit_move_insn (offset, tmp);
22050 emit_label (label);
22051 LABEL_NUSES (label) = 1;
22053 if (max_size > 2)
22055 rtx label = ix86_expand_aligntest (count, 2, true);
22056 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22057 src = change_address (srcmem, HImode, tmp);
22058 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22059 dest = change_address (destmem, HImode, tmp);
22060 emit_move_insn (dest, src);
22061 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22062 true, OPTAB_LIB_WIDEN);
22063 if (tmp != offset)
22064 emit_move_insn (offset, tmp);
22065 emit_label (label);
22066 LABEL_NUSES (label) = 1;
22068 if (max_size > 1)
22070 rtx label = ix86_expand_aligntest (count, 1, true);
22071 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22072 src = change_address (srcmem, QImode, tmp);
22073 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22074 dest = change_address (destmem, QImode, tmp);
22075 emit_move_insn (dest, src);
22076 emit_label (label);
22077 LABEL_NUSES (label) = 1;
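/* Editorial sketch, not part of the original file: for a compile-time COUNT
   the epilogue above peels the residual size off one power of two at a time
   (the 0x10 case is two 8-byte moves on 64-bit targets).  E.g. a residue of
   13 copies an 8-byte, a 4-byte and a 1-byte piece.  Plain-C version of the
   same decomposition, with an invented name:  */
static void
movmem_epilogue_sketch (unsigned char *dest, const unsigned char *src,
			unsigned long countval)
{
  unsigned long offset = 0;
  if (countval & 8)
    { __builtin_memcpy (dest + offset, src + offset, 8); offset += 8; }
  if (countval & 4)
    { __builtin_memcpy (dest + offset, src + offset, 4); offset += 4; }
  if (countval & 2)
    { __builtin_memcpy (dest + offset, src + offset, 2); offset += 2; }
  if (countval & 1)
    dest[offset] = src[offset];
}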
22082 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22083 static void
22084 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22085 rtx count, int max_size)
22087 count =
22088 expand_simple_binop (counter_mode (count), AND, count,
22089 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22090 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22091 gen_lowpart (QImode, value), count, QImode,
22092 1, max_size / 2);
22095 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22096 static void
22097 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22099 rtx dest;
22101 if (CONST_INT_P (count))
22103 HOST_WIDE_INT countval = INTVAL (count);
22104 int offset = 0;
22106 if ((countval & 0x10) && max_size > 16)
22108 if (TARGET_64BIT)
22110 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22111 emit_insn (gen_strset (destptr, dest, value));
22112 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22113 emit_insn (gen_strset (destptr, dest, value));
22115 else
22116 gcc_unreachable ();
22117 offset += 16;
22119 if ((countval & 0x08) && max_size > 8)
22121 if (TARGET_64BIT)
22123 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22124 emit_insn (gen_strset (destptr, dest, value));
22126 else
22128 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22129 emit_insn (gen_strset (destptr, dest, value));
22130 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22131 emit_insn (gen_strset (destptr, dest, value));
22133 offset += 8;
22135 if ((countval & 0x04) && max_size > 4)
22137 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22138 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22139 offset += 4;
22141 if ((countval & 0x02) && max_size > 2)
22143 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22144 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22145 offset += 2;
22147 if ((countval & 0x01) && max_size > 1)
22149 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22150 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22151 offset += 1;
22153 return;
22155 if (max_size > 32)
22157 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22158 return;
22160 if (max_size > 16)
22162 rtx label = ix86_expand_aligntest (count, 16, true);
22163 if (TARGET_64BIT)
22165 dest = change_address (destmem, DImode, destptr);
22166 emit_insn (gen_strset (destptr, dest, value));
22167 emit_insn (gen_strset (destptr, dest, value));
22169 else
22171 dest = change_address (destmem, SImode, destptr);
22172 emit_insn (gen_strset (destptr, dest, value));
22173 emit_insn (gen_strset (destptr, dest, value));
22174 emit_insn (gen_strset (destptr, dest, value));
22175 emit_insn (gen_strset (destptr, dest, value));
22177 emit_label (label);
22178 LABEL_NUSES (label) = 1;
22180 if (max_size > 8)
22182 rtx label = ix86_expand_aligntest (count, 8, true);
22183 if (TARGET_64BIT)
22185 dest = change_address (destmem, DImode, destptr);
22186 emit_insn (gen_strset (destptr, dest, value));
22188 else
22190 dest = change_address (destmem, SImode, destptr);
22191 emit_insn (gen_strset (destptr, dest, value));
22192 emit_insn (gen_strset (destptr, dest, value));
22194 emit_label (label);
22195 LABEL_NUSES (label) = 1;
22197 if (max_size > 4)
22199 rtx label = ix86_expand_aligntest (count, 4, true);
22200 dest = change_address (destmem, SImode, destptr);
22201 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22202 emit_label (label);
22203 LABEL_NUSES (label) = 1;
22205 if (max_size > 2)
22207 rtx label = ix86_expand_aligntest (count, 2, true);
22208 dest = change_address (destmem, HImode, destptr);
22209 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22210 emit_label (label);
22211 LABEL_NUSES (label) = 1;
22213 if (max_size > 1)
22215 rtx label = ix86_expand_aligntest (count, 1, true);
22216 dest = change_address (destmem, QImode, destptr);
22217 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22218 emit_label (label);
22219 LABEL_NUSES (label) = 1;
22223 /* Copy enough from SRC to DEST to align DEST, which is known to be aligned
22224 to ALIGN, up to DESIRED_ALIGNMENT. */
22225 static void
22226 expand_movmem_prologue (rtx destmem, rtx srcmem,
22227 rtx destptr, rtx srcptr, rtx count,
22228 int align, int desired_alignment)
22230 if (align <= 1 && desired_alignment > 1)
22232 rtx label = ix86_expand_aligntest (destptr, 1, false);
22233 srcmem = change_address (srcmem, QImode, srcptr);
22234 destmem = change_address (destmem, QImode, destptr);
22235 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22236 ix86_adjust_counter (count, 1);
22237 emit_label (label);
22238 LABEL_NUSES (label) = 1;
22240 if (align <= 2 && desired_alignment > 2)
22242 rtx label = ix86_expand_aligntest (destptr, 2, false);
22243 srcmem = change_address (srcmem, HImode, srcptr);
22244 destmem = change_address (destmem, HImode, destptr);
22245 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22246 ix86_adjust_counter (count, 2);
22247 emit_label (label);
22248 LABEL_NUSES (label) = 1;
22250 if (align <= 4 && desired_alignment > 4)
22252 rtx label = ix86_expand_aligntest (destptr, 4, false);
22253 srcmem = change_address (srcmem, SImode, srcptr);
22254 destmem = change_address (destmem, SImode, destptr);
22255 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22256 ix86_adjust_counter (count, 4);
22257 emit_label (label);
22258 LABEL_NUSES (label) = 1;
22260 gcc_assert (desired_alignment <= 8);
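/* Editorial sketch, not part of the original file: the alignment prologue
   above copies at most one 1-, 2- and 4-byte piece, each guarded by a test
   of the corresponding destination address bit, until the destination is
   aligned (here assuming desired alignment 8 and starting alignment 1).
   Plain-C equivalent, names invented:  */
static void
movmem_prologue_sketch (unsigned char **dest, const unsigned char **src,
			unsigned long *count)
{
  if ((unsigned long) *dest & 1)
    { **dest = **src; *dest += 1; *src += 1; *count -= 1; }
  if ((unsigned long) *dest & 2)
    { __builtin_memcpy (*dest, *src, 2); *dest += 2; *src += 2; *count -= 2; }
  if ((unsigned long) *dest & 4)
    { __builtin_memcpy (*dest, *src, 4); *dest += 4; *src += 4; *count -= 4; }
}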
22263 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22264 ALIGN_BYTES is how many bytes need to be copied. */
22265 static rtx
22266 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22267 int desired_align, int align_bytes)
22269 rtx src = *srcp;
22270 rtx orig_dst = dst;
22271 rtx orig_src = src;
22272 int off = 0;
22273 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22274 if (src_align_bytes >= 0)
22275 src_align_bytes = desired_align - src_align_bytes;
22276 if (align_bytes & 1)
22278 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22279 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22280 off = 1;
22281 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22283 if (align_bytes & 2)
22285 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22286 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22287 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22288 set_mem_align (dst, 2 * BITS_PER_UNIT);
22289 if (src_align_bytes >= 0
22290 && (src_align_bytes & 1) == (align_bytes & 1)
22291 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22292 set_mem_align (src, 2 * BITS_PER_UNIT);
22293 off = 2;
22294 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22296 if (align_bytes & 4)
22298 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22299 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22300 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22301 set_mem_align (dst, 4 * BITS_PER_UNIT);
22302 if (src_align_bytes >= 0)
22304 unsigned int src_align = 0;
22305 if ((src_align_bytes & 3) == (align_bytes & 3))
22306 src_align = 4;
22307 else if ((src_align_bytes & 1) == (align_bytes & 1))
22308 src_align = 2;
22309 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22310 set_mem_align (src, src_align * BITS_PER_UNIT);
22312 off = 4;
22313 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22315 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22316 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22317 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22318 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22319 if (src_align_bytes >= 0)
22321 unsigned int src_align = 0;
22322 if ((src_align_bytes & 7) == (align_bytes & 7))
22323 src_align = 8;
22324 else if ((src_align_bytes & 3) == (align_bytes & 3))
22325 src_align = 4;
22326 else if ((src_align_bytes & 1) == (align_bytes & 1))
22327 src_align = 2;
22328 if (src_align > (unsigned int) desired_align)
22329 src_align = desired_align;
22330 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22331 set_mem_align (src, src_align * BITS_PER_UNIT);
22333 if (MEM_SIZE_KNOWN_P (orig_dst))
22334 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22335 if (MEM_SIZE_KNOWN_P (orig_src))
22336 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22337 *srcp = src;
22338 return dst;
22341 /* Set enough of DEST to align it, known to be aligned to ALIGN, up to
22342 DESIRED_ALIGNMENT. */
22343 static void
22344 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22345 int align, int desired_alignment)
22347 if (align <= 1 && desired_alignment > 1)
22349 rtx label = ix86_expand_aligntest (destptr, 1, false);
22350 destmem = change_address (destmem, QImode, destptr);
22351 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22352 ix86_adjust_counter (count, 1);
22353 emit_label (label);
22354 LABEL_NUSES (label) = 1;
22356 if (align <= 2 && desired_alignment > 2)
22358 rtx label = ix86_expand_aligntest (destptr, 2, false);
22359 destmem = change_address (destmem, HImode, destptr);
22360 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22361 ix86_adjust_counter (count, 2);
22362 emit_label (label);
22363 LABEL_NUSES (label) = 1;
22365 if (align <= 4 && desired_alignment > 4)
22367 rtx label = ix86_expand_aligntest (destptr, 4, false);
22368 destmem = change_address (destmem, SImode, destptr);
22369 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22370 ix86_adjust_counter (count, 4);
22371 emit_label (label);
22372 LABEL_NUSES (label) = 1;
22374 gcc_assert (desired_alignment <= 8);
22377 /* Set enough of DST to align it, known to be aligned to ALIGN, up to
22378 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22379 static rtx
22380 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22381 int desired_align, int align_bytes)
22383 int off = 0;
22384 rtx orig_dst = dst;
22385 if (align_bytes & 1)
22387 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22388 off = 1;
22389 emit_insn (gen_strset (destreg, dst,
22390 gen_lowpart (QImode, value)));
22392 if (align_bytes & 2)
22394 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22395 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22396 set_mem_align (dst, 2 * BITS_PER_UNIT);
22397 off = 2;
22398 emit_insn (gen_strset (destreg, dst,
22399 gen_lowpart (HImode, value)));
22401 if (align_bytes & 4)
22403 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22404 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22405 set_mem_align (dst, 4 * BITS_PER_UNIT);
22406 off = 4;
22407 emit_insn (gen_strset (destreg, dst,
22408 gen_lowpart (SImode, value)));
22410 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22411 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22412 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22413 if (MEM_SIZE_KNOWN_P (orig_dst))
22414 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22415 return dst;
22418 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22419 static enum stringop_alg
22420 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22421 int *dynamic_check, bool *noalign)
22423 const struct stringop_algs * algs;
22424 bool optimize_for_speed;
22425 /* Algorithms using the rep prefix want at least edi and ecx;
22426 additionally, memset wants eax and memcpy wants esi. Don't
22427 consider such algorithms if the user has appropriated those
22428 registers for their own purposes. */
22429 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22430 || (memset
22431 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22432 *noalign = false;
22434 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22435 || (alg != rep_prefix_1_byte \
22436 && alg != rep_prefix_4_byte \
22437 && alg != rep_prefix_8_byte))
22438 const struct processor_costs *cost;
22440 /* Even if the string operation call is cold, we still might spend a lot
22441 of time processing large blocks. */
22442 if (optimize_function_for_size_p (cfun)
22443 || (optimize_insn_for_size_p ()
22444 && expected_size != -1 && expected_size < 256))
22445 optimize_for_speed = false;
22446 else
22447 optimize_for_speed = true;
22449 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22451 *dynamic_check = -1;
22452 if (memset)
22453 algs = &cost->memset[TARGET_64BIT != 0];
22454 else
22455 algs = &cost->memcpy[TARGET_64BIT != 0];
22456 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22457 return ix86_stringop_alg;
22458 /* rep; movq or rep; movl is the smallest variant. */
22459 else if (!optimize_for_speed)
22461 if (!count || (count & 3))
22462 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22463 else
22464 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22466 /* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
22468 else if (expected_size != -1 && expected_size < 4)
22469 return loop_1_byte;
22470 else if (expected_size != -1)
22472 unsigned int i;
22473 enum stringop_alg alg = libcall;
22474 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22476 /* We get here if the algorithms that were not libcall-based
22477 were rep-prefix based and we are unable to use rep prefixes
22478 based on global register usage. Break out of the loop and
22479 use the heuristic below. */
22480 if (algs->size[i].max == 0)
22481 break;
22482 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22484 enum stringop_alg candidate = algs->size[i].alg;
22486 if (candidate != libcall && ALG_USABLE_P (candidate))
22487 alg = candidate;
22488 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22489 last non-libcall inline algorithm. */
22490 if (TARGET_INLINE_ALL_STRINGOPS)
22492 /* When the current size is best copied by a libcall, but we are
22493 still forced to inline, run the heuristic below that will pick
22494 code for medium-sized blocks. */
22495 if (alg != libcall)
22496 return alg;
22497 break;
22499 else if (ALG_USABLE_P (candidate))
22501 *noalign = algs->size[i].noalign;
22502 return candidate;
22506 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22508 /* When asked to inline the call anyway, try to pick a meaningful choice.
22509 We look for the maximal size of block that is faster to copy by hand and
22510 take blocks of at most that size, guessing that the average size will
22511 be roughly half of the block.
22513 If this turns out to be bad, we might simply specify the preferred
22514 choice in ix86_costs. */
22515 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22516 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22518 int max = -1;
22519 enum stringop_alg alg;
22520 int i;
22521 bool any_alg_usable_p = true;
22523 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22525 enum stringop_alg candidate = algs->size[i].alg;
22526 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22528 if (candidate != libcall && candidate
22529 && ALG_USABLE_P (candidate))
22530 max = algs->size[i].max;
22532 /* If there aren't any usable algorithms, then recursing on
22533 smaller sizes isn't going to find anything. Just return the
22534 simple byte-at-a-time copy loop. */
22535 if (!any_alg_usable_p)
22537 /* Pick something reasonable. */
22538 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22539 *dynamic_check = 128;
22540 return loop_1_byte;
22542 if (max == -1)
22543 max = 4096;
22544 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22545 gcc_assert (*dynamic_check == -1);
22546 gcc_assert (alg != libcall);
22547 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22548 *dynamic_check = max;
22549 return alg;
22551 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22552 #undef ALG_USABLE_P
22555 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22556 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22557 static int
22558 decide_alignment (int align,
22559 enum stringop_alg alg,
22560 int expected_size)
22562 int desired_align = 0;
22563 switch (alg)
22565 case no_stringop:
22566 gcc_unreachable ();
22567 case loop:
22568 case unrolled_loop:
22569 desired_align = GET_MODE_SIZE (Pmode);
22570 break;
22571 case rep_prefix_8_byte:
22572 desired_align = 8;
22573 break;
22574 case rep_prefix_4_byte:
22575 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22576 copying a whole cache line at once. */
22577 if (TARGET_PENTIUMPRO)
22578 desired_align = 8;
22579 else
22580 desired_align = 4;
22581 break;
22582 case rep_prefix_1_byte:
22583 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22584 copying a whole cache line at once. */
22585 if (TARGET_PENTIUMPRO)
22586 desired_align = 8;
22587 else
22588 desired_align = 1;
22589 break;
22590 case loop_1_byte:
22591 desired_align = 1;
22592 break;
22593 case libcall:
22594 return 0;
22597 if (optimize_size)
22598 desired_align = 1;
22599 if (desired_align < align)
22600 desired_align = align;
22601 if (expected_size != -1 && expected_size < 4)
22602 desired_align = align;
22603 return desired_align;
22606 /* Return the smallest power of 2 greater than VAL. */
22607 static int
22608 smallest_pow2_greater_than (int val)
22610 int ret = 1;
22611 while (ret <= val)
22612 ret <<= 1;
22613 return ret;
22616 /* Expand string move (memcpy) operation. Use i386 string operations
22617 when profitable. expand_setmem contains similar code. The code
22618 depends upon architecture, block size and alignment, but always has
22619 the same overall structure:
22621 1) Prologue guard: a conditional that jumps ahead to the epilogue for small
22622 blocks that can be handled by the epilogue alone. This is faster,
22623 but also needed for correctness, since the prologue assumes the block
22624 is larger than the desired alignment.
22626 An optional dynamic check for size and a libcall for large
22627 blocks are emitted here too, with -minline-stringops-dynamically.
22629 2) Prologue: copy the first few bytes in order to get the destination
22630 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22631 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22632 copied. We emit either a jump tree on power-of-two-sized
22633 blocks, or a byte loop.
22635 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22636 with specified algorithm.
22638 4) Epilogue: code copying tail of the block that is too small to be
22639 handled by main body (or up to size guarded by prologue guard). */
22641 bool
22642 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22643 rtx expected_align_exp, rtx expected_size_exp)
22645 rtx destreg;
22646 rtx srcreg;
22647 rtx label = NULL;
22648 rtx tmp;
22649 rtx jump_around_label = NULL;
22650 HOST_WIDE_INT align = 1;
22651 unsigned HOST_WIDE_INT count = 0;
22652 HOST_WIDE_INT expected_size = -1;
22653 int size_needed = 0, epilogue_size_needed;
22654 int desired_align = 0, align_bytes = 0;
22655 enum stringop_alg alg;
22656 int dynamic_check;
22657 bool need_zero_guard = false;
22658 bool noalign;
22660 if (CONST_INT_P (align_exp))
22661 align = INTVAL (align_exp);
22662 /* i386 can do misaligned accesses at a reasonably increased cost. */
22663 if (CONST_INT_P (expected_align_exp)
22664 && INTVAL (expected_align_exp) > align)
22665 align = INTVAL (expected_align_exp);
22666 /* ALIGN is the minimum of destination and source alignment, but we care here
22667 just about destination alignment. */
22668 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22669 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22671 if (CONST_INT_P (count_exp))
22672 count = expected_size = INTVAL (count_exp);
22673 if (CONST_INT_P (expected_size_exp) && count == 0)
22674 expected_size = INTVAL (expected_size_exp);
22676 /* Make sure we don't need to care about overflow later on. */
22677 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22678 return false;
22680 /* Step 0: Decide on preferred algorithm, desired alignment and
22681 size of chunks to be copied by main loop. */
22683 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22684 desired_align = decide_alignment (align, alg, expected_size);
22686 if (!TARGET_ALIGN_STRINGOPS || noalign)
22687 align = desired_align;
22689 if (alg == libcall)
22690 return false;
22691 gcc_assert (alg != no_stringop);
22692 if (!count)
22693 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22694 destreg = copy_addr_to_reg (XEXP (dst, 0));
22695 srcreg = copy_addr_to_reg (XEXP (src, 0));
22696 switch (alg)
22698 case libcall:
22699 case no_stringop:
22700 gcc_unreachable ();
22701 case loop:
22702 need_zero_guard = true;
22703 size_needed = GET_MODE_SIZE (word_mode);
22704 break;
22705 case unrolled_loop:
22706 need_zero_guard = true;
22707 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22708 break;
22709 case rep_prefix_8_byte:
22710 size_needed = 8;
22711 break;
22712 case rep_prefix_4_byte:
22713 size_needed = 4;
22714 break;
22715 case rep_prefix_1_byte:
22716 size_needed = 1;
22717 break;
22718 case loop_1_byte:
22719 need_zero_guard = true;
22720 size_needed = 1;
22721 break;
22724 epilogue_size_needed = size_needed;
22726 /* Step 1: Prologue guard. */
22728 /* Alignment code needs count to be in register. */
22729 if (CONST_INT_P (count_exp) && desired_align > align)
22731 if (INTVAL (count_exp) > desired_align
22732 && INTVAL (count_exp) > size_needed)
22734 align_bytes
22735 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22736 if (align_bytes <= 0)
22737 align_bytes = 0;
22738 else
22739 align_bytes = desired_align - align_bytes;
22741 if (align_bytes == 0)
22742 count_exp = force_reg (counter_mode (count_exp), count_exp);
22744 gcc_assert (desired_align >= 1 && align >= 1);
22746 /* Ensure that alignment prologue won't copy past end of block. */
22747 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22749 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22750 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
22751 Make sure it is power of 2. */
22752 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22754 if (count)
22756 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22758 /* If main algorithm works on QImode, no epilogue is needed.
22759 For small sizes just don't align anything. */
22760 if (size_needed == 1)
22761 desired_align = align;
22762 else
22763 goto epilogue;
22766 else
22768 label = gen_label_rtx ();
22769 emit_cmp_and_jump_insns (count_exp,
22770 GEN_INT (epilogue_size_needed),
22771 LTU, 0, counter_mode (count_exp), 1, label);
22772 if (expected_size == -1 || expected_size < epilogue_size_needed)
22773 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22774 else
22775 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22779 /* Emit code to decide on runtime whether library call or inline should be
22780 used. */
22781 if (dynamic_check != -1)
22783 if (CONST_INT_P (count_exp))
22785 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22787 emit_block_move_via_libcall (dst, src, count_exp, false);
22788 count_exp = const0_rtx;
22789 goto epilogue;
22792 else
22794 rtx hot_label = gen_label_rtx ();
22795 jump_around_label = gen_label_rtx ();
22796 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22797 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22798 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22799 emit_block_move_via_libcall (dst, src, count_exp, false);
22800 emit_jump (jump_around_label);
22801 emit_label (hot_label);
22805 /* Step 2: Alignment prologue. */
22807 if (desired_align > align)
22809 if (align_bytes == 0)
22811 /* Except for the first move in the epilogue, we no longer know
22812 the constant offset in the aliasing info. It doesn't seem worth
22813 the pain to maintain it for the first move, so throw away
22814 the info early. */
22815 src = change_address (src, BLKmode, srcreg);
22816 dst = change_address (dst, BLKmode, destreg);
22817 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22818 desired_align);
22820 else
22822 /* If we know how many bytes need to be stored before dst is
22823 sufficiently aligned, maintain aliasing info accurately. */
22824 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22825 desired_align, align_bytes);
22826 count_exp = plus_constant (counter_mode (count_exp),
22827 count_exp, -align_bytes);
22828 count -= align_bytes;
22830 if (need_zero_guard
22831 && (count < (unsigned HOST_WIDE_INT) size_needed
22832 || (align_bytes == 0
22833 && count < ((unsigned HOST_WIDE_INT) size_needed
22834 + desired_align - align))))
22836 /* It is possible that we copied enough so the main loop will not
22837 execute. */
22838 gcc_assert (size_needed > 1);
22839 if (label == NULL_RTX)
22840 label = gen_label_rtx ();
22841 emit_cmp_and_jump_insns (count_exp,
22842 GEN_INT (size_needed),
22843 LTU, 0, counter_mode (count_exp), 1, label);
22844 if (expected_size == -1
22845 || expected_size < (desired_align - align) / 2 + size_needed)
22846 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22847 else
22848 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22851 if (label && size_needed == 1)
22853 emit_label (label);
22854 LABEL_NUSES (label) = 1;
22855 label = NULL;
22856 epilogue_size_needed = 1;
22858 else if (label == NULL_RTX)
22859 epilogue_size_needed = size_needed;
22861 /* Step 3: Main loop. */
22863 switch (alg)
22865 case libcall:
22866 case no_stringop:
22867 gcc_unreachable ();
22868 case loop_1_byte:
22869 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22870 count_exp, QImode, 1, expected_size);
22871 break;
22872 case loop:
22873 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22874 count_exp, word_mode, 1, expected_size);
22875 break;
22876 case unrolled_loop:
22877 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22878 registers for 4 temporaries anyway. */
22879 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22880 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22881 expected_size);
22882 break;
22883 case rep_prefix_8_byte:
22884 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22885 DImode);
22886 break;
22887 case rep_prefix_4_byte:
22888 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22889 SImode);
22890 break;
22891 case rep_prefix_1_byte:
22892 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22893 QImode);
22894 break;
22896 /* Properly adjust the offsets of src and dest memory for aliasing. */
22897 if (CONST_INT_P (count_exp))
22899 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22900 (count / size_needed) * size_needed);
22901 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22902 (count / size_needed) * size_needed);
22904 else
22906 src = change_address (src, BLKmode, srcreg);
22907 dst = change_address (dst, BLKmode, destreg);
22910 /* Step 4: Epilogue to copy the remaining bytes. */
22911 epilogue:
22912 if (label)
22914 /* When the main loop is done, COUNT_EXP might hold the original count,
22915 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22916 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22917 bytes. Compensate if needed. */
22919 if (size_needed < epilogue_size_needed)
22921 tmp =
22922 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22923 GEN_INT (size_needed - 1), count_exp, 1,
22924 OPTAB_DIRECT);
22925 if (tmp != count_exp)
22926 emit_move_insn (count_exp, tmp);
22928 emit_label (label);
22929 LABEL_NUSES (label) = 1;
22932 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22933 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22934 epilogue_size_needed);
22935 if (jump_around_label)
22936 emit_label (jump_around_label);
22937 return true;
22940 /* Helper function for memset. For QImode value 0xXY produce
22941 0xXYXYXYXY of the width specified by MODE. This is essentially
22942 a * 0x10101010, but we can do slightly better than
22943 synth_mult by unwinding the sequence by hand on CPUs with
22944 slow multiply (see the host-side sketch after this function). */
22945 static rtx
22946 promote_duplicated_reg (enum machine_mode mode, rtx val)
22948 enum machine_mode valmode = GET_MODE (val);
22949 rtx tmp;
22950 int nops = mode == DImode ? 3 : 2;
22952 gcc_assert (mode == SImode || mode == DImode);
22953 if (val == const0_rtx)
22954 return copy_to_mode_reg (mode, const0_rtx);
22955 if (CONST_INT_P (val))
22957 HOST_WIDE_INT v = INTVAL (val) & 255;
22959 v |= v << 8;
22960 v |= v << 16;
22961 if (mode == DImode)
22962 v |= (v << 16) << 16;
22963 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22966 if (valmode == VOIDmode)
22967 valmode = QImode;
22968 if (valmode != QImode)
22969 val = gen_lowpart (QImode, val);
22970 if (mode == QImode)
22971 return val;
22972 if (!TARGET_PARTIAL_REG_STALL)
22973 nops--;
22974 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22975 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22976 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22977 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22979 rtx reg = convert_modes (mode, QImode, val, true);
22980 tmp = promote_duplicated_reg (mode, const1_rtx);
22981 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22982 OPTAB_DIRECT);
22984 else
22986 rtx reg = convert_modes (mode, QImode, val, true);
22988 if (!TARGET_PARTIAL_REG_STALL)
22989 if (mode == SImode)
22990 emit_insn (gen_movsi_insv_1 (reg, reg));
22991 else
22992 emit_insn (gen_movdi_insv_1 (reg, reg));
22993 else
22995 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22996 NULL, 1, OPTAB_DIRECT);
22997 reg =
22998 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23000 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23001 NULL, 1, OPTAB_DIRECT);
23002 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23003 if (mode == SImode)
23004 return reg;
23005 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23006 NULL, 1, OPTAB_DIRECT);
23007 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23008 return reg;
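/* Editorial sketch (not part of GCC): a host-side illustration of the
   byte-broadcast performed above, both by the constant path and by the
   shift/IOR sequence for a register value.  The helper name broadcast_byte
   is made up and is not used elsewhere in this file.  */
#include <stdint.h>

static uint64_t
broadcast_byte (uint8_t byte, int width_bytes)  /* width_bytes is 4 or 8 */
{
  uint64_t v = byte;            /* 0x00000000000000XY */
  v |= v << 8;                  /* 0x000000000000XYXY */
  v |= v << 16;                 /* 0x00000000XYXYXYXY */
  if (width_bytes == 8)
    v |= v << 32;               /* 0xXYXYXYXYXYXYXYXY */
  return v;
}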
23012 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
23013 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
23014 raising the alignment from ALIGN to DESIRED_ALIGN. */
23015 static rtx
23016 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23018 rtx promoted_val;
23020 if (TARGET_64BIT
23021 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23022 promoted_val = promote_duplicated_reg (DImode, val);
23023 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23024 promoted_val = promote_duplicated_reg (SImode, val);
23025 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23026 promoted_val = promote_duplicated_reg (HImode, val);
23027 else
23028 promoted_val = val;
23030 return promoted_val;
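/* Editorial note (not part of GCC): for example, with size_needed == 8
   (a rep stosq main loop) on a 64-bit target the value is broadcast to a
   full DImode constant or register, while with size_needed == 1 and no
   extra alignment to gain the original QImode value is returned as-is.  */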
23033 /* Expand string clear operation (bzero). Use i386 string operations when
23034 profitable. See expand_movmem comment for explanation of individual
23035 steps performed. */
23036 bool
23037 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23038 rtx expected_align_exp, rtx expected_size_exp)
23040 rtx destreg;
23041 rtx label = NULL;
23042 rtx tmp;
23043 rtx jump_around_label = NULL;
23044 HOST_WIDE_INT align = 1;
23045 unsigned HOST_WIDE_INT count = 0;
23046 HOST_WIDE_INT expected_size = -1;
23047 int size_needed = 0, epilogue_size_needed;
23048 int desired_align = 0, align_bytes = 0;
23049 enum stringop_alg alg;
23050 rtx promoted_val = NULL;
23051 bool force_loopy_epilogue = false;
23052 int dynamic_check;
23053 bool need_zero_guard = false;
23054 bool noalign;
23056 if (CONST_INT_P (align_exp))
23057 align = INTVAL (align_exp);
23058 /* i386 can do misaligned access at a reasonable increase in cost. */
23059 if (CONST_INT_P (expected_align_exp)
23060 && INTVAL (expected_align_exp) > align)
23061 align = INTVAL (expected_align_exp);
23062 if (CONST_INT_P (count_exp))
23063 count = expected_size = INTVAL (count_exp);
23064 if (CONST_INT_P (expected_size_exp) && count == 0)
23065 expected_size = INTVAL (expected_size_exp);
23067 /* Make sure we don't need to care about overflow later on. */
23068 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23069 return false;
23071 /* Step 0: Decide on preferred algorithm, desired alignment and
23072 size of chunks to be copied by main loop. */
23074 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23075 desired_align = decide_alignment (align, alg, expected_size);
23077 if (!TARGET_ALIGN_STRINGOPS || noalign)
23078 align = desired_align;
23080 if (alg == libcall)
23081 return false;
23082 gcc_assert (alg != no_stringop);
23083 if (!count)
23084 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23085 destreg = copy_addr_to_reg (XEXP (dst, 0));
23086 switch (alg)
23088 case libcall:
23089 case no_stringop:
23090 gcc_unreachable ();
23091 case loop:
23092 need_zero_guard = true;
23093 size_needed = GET_MODE_SIZE (word_mode);
23094 break;
23095 case unrolled_loop:
23096 need_zero_guard = true;
23097 size_needed = GET_MODE_SIZE (word_mode) * 4;
23098 break;
23099 case rep_prefix_8_byte:
23100 size_needed = 8;
23101 break;
23102 case rep_prefix_4_byte:
23103 size_needed = 4;
23104 break;
23105 case rep_prefix_1_byte:
23106 size_needed = 1;
23107 break;
23108 case loop_1_byte:
23109 need_zero_guard = true;
23110 size_needed = 1;
23111 break;
23113 epilogue_size_needed = size_needed;
23115 /* Step 1: Prologue guard. */
23117 /* Alignment code needs count to be in register. */
23118 if (CONST_INT_P (count_exp) && desired_align > align)
23120 if (INTVAL (count_exp) > desired_align
23121 && INTVAL (count_exp) > size_needed)
23123 align_bytes
23124 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23125 if (align_bytes <= 0)
23126 align_bytes = 0;
23127 else
23128 align_bytes = desired_align - align_bytes;
23130 if (align_bytes == 0)
23132 enum machine_mode mode = SImode;
23133 if (TARGET_64BIT && (count & ~0xffffffff))
23134 mode = DImode;
23135 count_exp = force_reg (mode, count_exp);
23138 /* Do the cheap promotion to allow better CSE across the
23139 main loop and epilogue (i.e. one load of the big constant in
23140 front of all the code). */
23141 if (CONST_INT_P (val_exp))
23142 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23143 desired_align, align);
23144 /* Ensure that alignment prologue won't copy past end of block. */
23145 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23147 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23148 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23149 Make sure it is a power of 2. */
23150 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
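      /* Editorial note (not part of GCC): for example, an unrolled 64-bit
         main loop has size_needed == 32; with desired_align == 16 and
         align == 4 the code above computes MAX (31, 12) == 31 and rounds it
         up to the power of two 32, so the epilogue masks the count
         with 31.  */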
23152 /* To improve performance of small blocks, we jump around the VAL
23153 promoting code. This means that if the promoted VAL is not constant,
23154 we might not use it in the epilogue and have to use the byte
23155 loop variant. */
23156 if (epilogue_size_needed > 2 && !promoted_val)
23157 force_loopy_epilogue = true;
23158 if (count)
23160 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23162 /* If the main algorithm works on QImode, no epilogue is needed.
23163 For small sizes just don't align anything. */
23164 if (size_needed == 1)
23165 desired_align = align;
23166 else
23167 goto epilogue;
23170 else
23172 label = gen_label_rtx ();
23173 emit_cmp_and_jump_insns (count_exp,
23174 GEN_INT (epilogue_size_needed),
23175 LTU, 0, counter_mode (count_exp), 1, label);
23176 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23177 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23178 else
23179 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23182 if (dynamic_check != -1)
23184 rtx hot_label = gen_label_rtx ();
23185 jump_around_label = gen_label_rtx ();
23186 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23187 LEU, 0, counter_mode (count_exp), 1, hot_label);
23188 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23189 set_storage_via_libcall (dst, count_exp, val_exp, false);
23190 emit_jump (jump_around_label);
23191 emit_label (hot_label);
23194 /* Step 2: Alignment prologue. */
23196 /* Do the expensive promotion once we branched off the small blocks. */
23197 if (!promoted_val)
23198 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23199 desired_align, align);
23200 gcc_assert (desired_align >= 1 && align >= 1);
23202 if (desired_align > align)
23204 if (align_bytes == 0)
23206 /* Except for the first move in epilogue, we no longer know
23207 constant offset in aliasing info. It does not seem worth
23208 the pain to maintain it for the first move, so throw away
23209 the info early. */
23210 dst = change_address (dst, BLKmode, destreg);
23211 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23212 desired_align);
23214 else
23216 /* If we know how many bytes need to be stored before dst is
23217 sufficiently aligned, maintain aliasing info accurately. */
23218 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23219 desired_align, align_bytes);
23220 count_exp = plus_constant (counter_mode (count_exp),
23221 count_exp, -align_bytes);
23222 count -= align_bytes;
23224 if (need_zero_guard
23225 && (count < (unsigned HOST_WIDE_INT) size_needed
23226 || (align_bytes == 0
23227 && count < ((unsigned HOST_WIDE_INT) size_needed
23228 + desired_align - align))))
23230 /* It is possible that we copied enough so the main loop will not
23231 execute. */
23232 gcc_assert (size_needed > 1);
23233 if (label == NULL_RTX)
23234 label = gen_label_rtx ();
23235 emit_cmp_and_jump_insns (count_exp,
23236 GEN_INT (size_needed),
23237 LTU, 0, counter_mode (count_exp), 1, label);
23238 if (expected_size == -1
23239 || expected_size < (desired_align - align) / 2 + size_needed)
23240 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23241 else
23242 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23245 if (label && size_needed == 1)
23247 emit_label (label);
23248 LABEL_NUSES (label) = 1;
23249 label = NULL;
23250 promoted_val = val_exp;
23251 epilogue_size_needed = 1;
23253 else if (label == NULL_RTX)
23254 epilogue_size_needed = size_needed;
23256 /* Step 3: Main loop. */
23258 switch (alg)
23260 case libcall:
23261 case no_stringop:
23262 gcc_unreachable ();
23263 case loop_1_byte:
23264 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23265 count_exp, QImode, 1, expected_size);
23266 break;
23267 case loop:
23268 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23269 count_exp, word_mode, 1, expected_size);
23270 break;
23271 case unrolled_loop:
23272 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23273 count_exp, word_mode, 4, expected_size);
23274 break;
23275 case rep_prefix_8_byte:
23276 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23277 DImode, val_exp);
23278 break;
23279 case rep_prefix_4_byte:
23280 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23281 SImode, val_exp);
23282 break;
23283 case rep_prefix_1_byte:
23284 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23285 QImode, val_exp);
23286 break;
23288 /* Properly adjust the offset of the dest memory for aliasing. */
23289 if (CONST_INT_P (count_exp))
23290 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23291 (count / size_needed) * size_needed);
23292 else
23293 dst = change_address (dst, BLKmode, destreg);
23295 /* Step 4: Epilogue to copy the remaining bytes. */
23297 if (label)
23299 /* When the main loop is done, COUNT_EXP might hold the original count,
23300 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23301 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23302 bytes. Compensate if needed. */
23304 if (size_needed < epilogue_size_needed)
23306 tmp =
23307 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23308 GEN_INT (size_needed - 1), count_exp, 1,
23309 OPTAB_DIRECT);
23310 if (tmp != count_exp)
23311 emit_move_insn (count_exp, tmp);
23313 emit_label (label);
23314 LABEL_NUSES (label) = 1;
23316 epilogue:
23317 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23319 if (force_loopy_epilogue)
23320 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23321 epilogue_size_needed);
23322 else
23323 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23324 epilogue_size_needed);
23326 if (jump_around_label)
23327 emit_label (jump_around_label);
23328 return true;
23331 /* Expand the appropriate insns for doing strlen if not just doing
23332 repnz; scasb
23334 out = result, initialized with the start address
23335 align_rtx = alignment of the address.
23336 scratch = scratch register, initialized with the start address when
23337 not aligned, otherwise undefined
23339 This is just the body. It needs the initializations mentioned above and
23340 some address computing at the end. These things are done in i386.md. */
23342 static void
23343 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23345 int align;
23346 rtx tmp;
23347 rtx align_2_label = NULL_RTX;
23348 rtx align_3_label = NULL_RTX;
23349 rtx align_4_label = gen_label_rtx ();
23350 rtx end_0_label = gen_label_rtx ();
23351 rtx mem;
23352 rtx tmpreg = gen_reg_rtx (SImode);
23353 rtx scratch = gen_reg_rtx (SImode);
23354 rtx cmp;
23356 align = 0;
23357 if (CONST_INT_P (align_rtx))
23358 align = INTVAL (align_rtx);
23360 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23362 /* Is there a known alignment and is it less than 4? */
23363 if (align < 4)
23365 rtx scratch1 = gen_reg_rtx (Pmode);
23366 emit_move_insn (scratch1, out);
23367 /* Is there a known alignment and is it not 2? */
23368 if (align != 2)
23370 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23371 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23373 /* Leave just the 3 lower bits. */
23374 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23375 NULL_RTX, 0, OPTAB_WIDEN);
23377 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23378 Pmode, 1, align_4_label);
23379 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23380 Pmode, 1, align_2_label);
23381 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23382 Pmode, 1, align_3_label);
23384 else
23386 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23387 check whether it is aligned to a 4-byte boundary. */
23389 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23390 NULL_RTX, 0, OPTAB_WIDEN);
23392 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23393 Pmode, 1, align_4_label);
23396 mem = change_address (src, QImode, out);
23398 /* Now compare the bytes. */
23400 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23401 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23402 QImode, 1, end_0_label);
23404 /* Increment the address. */
23405 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23407 /* Not needed with an alignment of 2 */
23408 if (align != 2)
23410 emit_label (align_2_label);
23412 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23413 end_0_label);
23415 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23417 emit_label (align_3_label);
23420 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23421 end_0_label);
23423 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23426 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23427 align this loop: it only makes the program larger and does not
23428 speed it up. */
23429 emit_label (align_4_label);
23431 mem = change_address (src, SImode, out);
23432 emit_move_insn (scratch, mem);
23433 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23435 /* This formula yields a nonzero result iff one of the bytes is zero
23436 (see the host-side sketch after this function). It saves three branches inside the loop and many cycles. */
23438 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23439 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23440 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23441 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23442 gen_int_mode (0x80808080, SImode)));
23443 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23444 align_4_label);
23446 if (TARGET_CMOVE)
23448 rtx reg = gen_reg_rtx (SImode);
23449 rtx reg2 = gen_reg_rtx (Pmode);
23450 emit_move_insn (reg, tmpreg);
23451 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23453 /* If zero is not in the first two bytes, move two bytes forward. */
23454 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23455 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23456 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23457 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23458 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23459 reg,
23460 tmpreg)));
23461 /* Emit lea manually to avoid clobbering of flags. */
23462 emit_insn (gen_rtx_SET (SImode, reg2,
23463 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23465 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23466 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23467 emit_insn (gen_rtx_SET (VOIDmode, out,
23468 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23469 reg2,
23470 out)));
23472 else
23474 rtx end_2_label = gen_label_rtx ();
23475 /* Is zero in the first two bytes? */
23477 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23478 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23479 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23480 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23481 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23482 pc_rtx);
23483 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23484 JUMP_LABEL (tmp) = end_2_label;
23486 /* Not in the first two. Move two bytes forward. */
23487 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23488 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23490 emit_label (end_2_label);
23494 /* Avoid branch in fixing the byte. */
23495 tmpreg = gen_lowpart (QImode, tmpreg);
23496 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23497 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23498 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23499 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23501 emit_label (end_0_label);
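/* Editorial sketch (not part of GCC): a host-side version of the zero-byte
   test whose RTL the loop above emits; the helper name is made up.
   Subtracting 0x01010101 sets the top bit of every byte that was zero
   (possibly via a borrow), and masking with ~x and 0x80808080 discards top
   bits of bytes that were already >= 0x80, so the result is nonzero exactly
   when X contains a zero byte.  */
#include <stdint.h>

static int
word_has_zero_byte (uint32_t x)
{
  return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}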
23504 /* Expand strlen. */
23506 bool
23507 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23509 rtx addr, scratch1, scratch2, scratch3, scratch4;
23511 /* The generic case of the strlen expander is long. Avoid expanding
23512 it unless TARGET_INLINE_ALL_STRINGOPS. */
23514 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23515 && !TARGET_INLINE_ALL_STRINGOPS
23516 && !optimize_insn_for_size_p ()
23517 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23518 return false;
23520 addr = force_reg (Pmode, XEXP (src, 0));
23521 scratch1 = gen_reg_rtx (Pmode);
23523 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23524 && !optimize_insn_for_size_p ())
23526 /* Well, it seems that some optimizer does not combine a call like
23527 foo(strlen(bar), strlen(bar));
23528 when the move and the subtraction are done here. It does calculate
23529 the length just once when these instructions are done inside of
23530 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
23531 often used and I use one fewer register for the lifetime of
23532 output_strlen_unroll(), this is better. */
23534 emit_move_insn (out, addr);
23536 ix86_expand_strlensi_unroll_1 (out, src, align);
23538 /* strlensi_unroll_1 returns the address of the zero at the end of
23539 the string, like memchr(), so compute the length by subtracting
23540 the start address. */
23541 emit_insn (ix86_gen_sub3 (out, out, addr));
23543 else
23545 rtx unspec;
23547 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23548 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23549 return false;
23551 scratch2 = gen_reg_rtx (Pmode);
23552 scratch3 = gen_reg_rtx (Pmode);
23553 scratch4 = force_reg (Pmode, constm1_rtx);
23555 emit_move_insn (scratch3, addr);
23556 eoschar = force_reg (QImode, eoschar);
23558 src = replace_equiv_address_nv (src, scratch3);
23560 /* If .md starts supporting :P, this can be done in .md. */
23561 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23562 scratch4), UNSPEC_SCAS);
23563 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23564 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23565 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
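      /* Editorial note (not part of GCC): repnz scasb starts with the count
         register at -1 (scratch4) and decrements it for every byte examined,
         including the terminating zero.  For a string of N characters it
         examines N + 1 bytes, leaving scratch1 == -(N + 2); the one's
         complement gives N + 1 and the final add of -1 yields N in OUT.  */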
23567 return true;
23570 /* For a given symbol (function), construct code to compute the address of its
23571 PLT entry in the large x86-64 PIC model. */
23572 static rtx
23573 construct_plt_address (rtx symbol)
23575 rtx tmp, unspec;
23577 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23578 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23579 gcc_assert (Pmode == DImode);
23581 tmp = gen_reg_rtx (Pmode);
23582 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23584 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23585 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23586 return tmp;
23590 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23591 rtx callarg2,
23592 rtx pop, bool sibcall)
23594 /* We need to represent that SI and DI registers are clobbered
23595 by SYSV calls. */
23596 static int clobbered_registers[] = {
23597 XMM6_REG, XMM7_REG, XMM8_REG,
23598 XMM9_REG, XMM10_REG, XMM11_REG,
23599 XMM12_REG, XMM13_REG, XMM14_REG,
23600 XMM15_REG, SI_REG, DI_REG
23602 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23603 rtx use = NULL, call;
23604 unsigned int vec_len;
23606 if (pop == const0_rtx)
23607 pop = NULL;
23608 gcc_assert (!TARGET_64BIT || !pop);
23610 if (TARGET_MACHO && !TARGET_64BIT)
23612 #if TARGET_MACHO
23613 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23614 fnaddr = machopic_indirect_call_target (fnaddr);
23615 #endif
23617 else
23619 /* Static functions and indirect calls don't need the pic register. */
23620 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23621 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23622 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23623 use_reg (&use, pic_offset_table_rtx);
23626 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23628 rtx al = gen_rtx_REG (QImode, AX_REG);
23629 emit_move_insn (al, callarg2);
23630 use_reg (&use, al);
23633 if (ix86_cmodel == CM_LARGE_PIC
23634 && MEM_P (fnaddr)
23635 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23636 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23637 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23638 else if (sibcall
23639 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23640 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23642 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23643 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23646 vec_len = 0;
23647 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23648 if (retval)
23649 call = gen_rtx_SET (VOIDmode, retval, call);
23650 vec[vec_len++] = call;
23652 if (pop)
23654 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23655 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23656 vec[vec_len++] = pop;
23659 if (TARGET_64BIT_MS_ABI
23660 && (!callarg2 || INTVAL (callarg2) != -2))
23662 unsigned i;
23664 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23665 UNSPEC_MS_TO_SYSV_CALL);
23667 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23668 vec[vec_len++]
23669 = gen_rtx_CLOBBER (VOIDmode,
23670 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23671 ? TImode : DImode,
23672 clobbered_registers[i]));
23675 if (vec_len > 1)
23676 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23677 call = emit_call_insn (call);
23678 if (use)
23679 CALL_INSN_FUNCTION_USAGE (call) = use;
23681 return call;
23684 /* Output the assembly for a call instruction. */
23686 const char *
23687 ix86_output_call_insn (rtx insn, rtx call_op)
23689 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23690 bool seh_nop_p = false;
23691 const char *xasm;
23693 if (SIBLING_CALL_P (insn))
23695 if (direct_p)
23696 xasm = "jmp\t%P0";
23697 /* SEH epilogue detection requires the indirect branch case
23698 to include REX.W. */
23699 else if (TARGET_SEH)
23700 xasm = "rex.W jmp %A0";
23701 else
23702 xasm = "jmp\t%A0";
23704 output_asm_insn (xasm, &call_op);
23705 return "";
23708 /* SEH unwinding can require an extra nop to be emitted in several
23709 circumstances. Determine if we have one of those. */
23710 if (TARGET_SEH)
23712 rtx i;
23714 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23716 /* If we get to another real insn, we don't need the nop. */
23717 if (INSN_P (i))
23718 break;
23720 /* If we get to the epilogue note, prevent a catch region from
23721 being adjacent to the standard epilogue sequence. If non-
23722 call-exceptions, we'll have done this during epilogue emission. */
23723 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23724 && !flag_non_call_exceptions
23725 && !can_throw_internal (insn))
23727 seh_nop_p = true;
23728 break;
23732 /* If we didn't find a real insn following the call, prevent the
23733 unwinder from looking into the next function. */
23734 if (i == NULL)
23735 seh_nop_p = true;
23738 if (direct_p)
23739 xasm = "call\t%P0";
23740 else
23741 xasm = "call\t%A0";
23743 output_asm_insn (xasm, &call_op);
23745 if (seh_nop_p)
23746 return "nop";
23748 return "";
23751 /* Clear stack slot assignments remembered from previous functions.
23752 This is called from INIT_EXPANDERS once before RTL is emitted for each
23753 function. */
23755 static struct machine_function *
23756 ix86_init_machine_status (void)
23758 struct machine_function *f;
23760 f = ggc_alloc_cleared_machine_function ();
23761 f->use_fast_prologue_epilogue_nregs = -1;
23762 f->call_abi = ix86_abi;
23764 return f;
23767 /* Return a MEM corresponding to a stack slot with mode MODE.
23768 Allocate a new slot if necessary.
23770 The RTL for a function can have several slots available: N is
23771 which slot to use. */
23774 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23776 struct stack_local_entry *s;
23778 gcc_assert (n < MAX_386_STACK_LOCALS);
23780 for (s = ix86_stack_locals; s; s = s->next)
23781 if (s->mode == mode && s->n == n)
23782 return validize_mem (copy_rtx (s->rtl));
23784 s = ggc_alloc_stack_local_entry ();
23785 s->n = n;
23786 s->mode = mode;
23787 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23789 s->next = ix86_stack_locals;
23790 ix86_stack_locals = s;
23791 return validize_mem (s->rtl);
23794 static void
23795 ix86_instantiate_decls (void)
23797 struct stack_local_entry *s;
23799 for (s = ix86_stack_locals; s; s = s->next)
23800 if (s->rtl != NULL_RTX)
23801 instantiate_decl_rtl (s->rtl);
23804 /* Calculate the length of the memory address in the instruction encoding.
23805 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23806 or other prefixes. We never generate addr32 prefix for LEA insn. */
23809 memory_address_length (rtx addr, bool lea)
23811 struct ix86_address parts;
23812 rtx base, index, disp;
23813 int len;
23814 int ok;
23816 if (GET_CODE (addr) == PRE_DEC
23817 || GET_CODE (addr) == POST_INC
23818 || GET_CODE (addr) == PRE_MODIFY
23819 || GET_CODE (addr) == POST_MODIFY)
23820 return 0;
23822 ok = ix86_decompose_address (addr, &parts);
23823 gcc_assert (ok);
23825 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23827 /* If this is not LEA instruction, add the length of addr32 prefix. */
23828 if (TARGET_64BIT && !lea
23829 && (SImode_address_operand (addr, VOIDmode)
23830 || (parts.base && GET_MODE (parts.base) == SImode)
23831 || (parts.index && GET_MODE (parts.index) == SImode)))
23832 len++;
23834 base = parts.base;
23835 index = parts.index;
23836 disp = parts.disp;
23838 if (base && GET_CODE (base) == SUBREG)
23839 base = SUBREG_REG (base);
23840 if (index && GET_CODE (index) == SUBREG)
23841 index = SUBREG_REG (index);
23843 gcc_assert (base == NULL_RTX || REG_P (base));
23844 gcc_assert (index == NULL_RTX || REG_P (index));
23846 /* Rule of thumb:
23847 - esp as the base always wants an index,
23848 - ebp as the base always wants a displacement,
23849 - r12 as the base always wants an index,
23850 - r13 as the base always wants a displacement. */
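  /* Editorial examples (not part of GCC) of the rules above, counting only
     the bytes this function returns (beyond the opcode and the first modrm
     byte): (%eax) needs nothing extra; (%esp) needs a SIB byte; (%ebp) must
     be encoded as 0(%ebp) and so needs a disp8; 8(%eax) needs a disp8 while
     300(%eax) needs a disp32; in 64-bit code (%r12) and (%r13) behave like
     (%rsp) and (%rbp) because only the low three bits of the register number
     are visible in the modrm byte.  */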
23852 /* Register Indirect. */
23853 if (base && !index && !disp)
23855 /* esp (for its index) and ebp (for its displacement) need
23856 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23857 code. */
23858 if (base == arg_pointer_rtx
23859 || base == frame_pointer_rtx
23860 || REGNO (base) == SP_REG
23861 || REGNO (base) == BP_REG
23862 || REGNO (base) == R12_REG
23863 || REGNO (base) == R13_REG)
23864 len++;
23867 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23868 is not disp32, but disp32(%rip), so for disp32
23869 SIB byte is needed, unless print_operand_address
23870 optimizes it into disp32(%rip) or (%rip) is implied
23871 by UNSPEC. */
23872 else if (disp && !base && !index)
23874 len += 4;
23875 if (TARGET_64BIT)
23877 rtx symbol = disp;
23879 if (GET_CODE (disp) == CONST)
23880 symbol = XEXP (disp, 0);
23881 if (GET_CODE (symbol) == PLUS
23882 && CONST_INT_P (XEXP (symbol, 1)))
23883 symbol = XEXP (symbol, 0);
23885 if (GET_CODE (symbol) != LABEL_REF
23886 && (GET_CODE (symbol) != SYMBOL_REF
23887 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23888 && (GET_CODE (symbol) != UNSPEC
23889 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23890 && XINT (symbol, 1) != UNSPEC_PCREL
23891 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23892 len++;
23895 else
23897 /* Find the length of the displacement constant. */
23898 if (disp)
23900 if (base && satisfies_constraint_K (disp))
23901 len += 1;
23902 else
23903 len += 4;
23905 /* ebp always wants a displacement. Similarly r13. */
23906 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23907 len++;
23909 /* An index requires the two-byte modrm form.... */
23910 if (index
23911 /* ...like esp (or r12), which always wants an index. */
23912 || base == arg_pointer_rtx
23913 || base == frame_pointer_rtx
23914 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23915 len++;
23918 return len;
23921 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23922 is set, expect that the insn has an 8-bit immediate alternative. */
23924 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23926 int len = 0;
23927 int i;
23928 extract_insn_cached (insn);
23929 for (i = recog_data.n_operands - 1; i >= 0; --i)
23930 if (CONSTANT_P (recog_data.operand[i]))
23932 enum attr_mode mode = get_attr_mode (insn);
23934 gcc_assert (!len);
23935 if (shortform && CONST_INT_P (recog_data.operand[i]))
23937 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23938 switch (mode)
23940 case MODE_QI:
23941 len = 1;
23942 continue;
23943 case MODE_HI:
23944 ival = trunc_int_for_mode (ival, HImode);
23945 break;
23946 case MODE_SI:
23947 ival = trunc_int_for_mode (ival, SImode);
23948 break;
23949 default:
23950 break;
23952 if (IN_RANGE (ival, -128, 127))
23954 len = 1;
23955 continue;
23958 switch (mode)
23960 case MODE_QI:
23961 len = 1;
23962 break;
23963 case MODE_HI:
23964 len = 2;
23965 break;
23966 case MODE_SI:
23967 len = 4;
23968 break;
23969 /* Immediates for DImode instructions are encoded
23970 as 32bit sign extended values. */
23971 case MODE_DI:
23972 len = 4;
23973 break;
23974 default:
23975 fatal_insn ("unknown insn mode", insn);
23978 return len;
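/* Editorial sketch (not part of GCC): the immediate-length rules above in
   isolation.  MODE_BYTES is the operand size in bytes (8 stands for DImode,
   whose immediates are encoded as 32-bit sign-extended fields) and SHORTFORM
   says the insn has an imm8 alternative.  The helper name is made up and the
   short-form test is simplified: the real code truncates the value to the
   operand mode before checking the signed-byte range.  */

static int
imm_field_len_example (long long ival, int mode_bytes, int shortform)
{
  if (mode_bytes == 1)
    return 1;                   /* QImode immediates always take one byte */
  if (shortform && ival >= -128 && ival <= 127)
    return 1;                   /* the imm8 alternative applies */
  if (mode_bytes >= 4)
    return 4;                   /* SImode, and sign-extended DImode */
  return 2;                     /* HImode */
}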
23981 /* Compute default value for "length_address" attribute. */
23983 ix86_attr_length_address_default (rtx insn)
23985 int i;
23987 if (get_attr_type (insn) == TYPE_LEA)
23989 rtx set = PATTERN (insn), addr;
23991 if (GET_CODE (set) == PARALLEL)
23992 set = XVECEXP (set, 0, 0);
23994 gcc_assert (GET_CODE (set) == SET);
23996 addr = SET_SRC (set);
23998 return memory_address_length (addr, true);
24001 extract_insn_cached (insn);
24002 for (i = recog_data.n_operands - 1; i >= 0; --i)
24003 if (MEM_P (recog_data.operand[i]))
24005 constrain_operands_cached (reload_completed);
24006 if (which_alternative != -1)
24008 const char *constraints = recog_data.constraints[i];
24009 int alt = which_alternative;
24011 while (*constraints == '=' || *constraints == '+')
24012 constraints++;
24013 while (alt-- > 0)
24014 while (*constraints++ != ',')
24016 /* Skip ignored operands. */
24017 if (*constraints == 'X')
24018 continue;
24020 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24022 return 0;
24025 /* Compute default value for "length_vex" attribute. It includes
24026 2 or 3 byte VEX prefix and 1 opcode byte. */
24029 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24031 int i;
24033 /* Only opcodes in the 0f map can use the 2-byte VEX prefix, and the VEX W
24034 bit requires the 3-byte VEX prefix. */
24035 if (!has_0f_opcode || has_vex_w)
24036 return 3 + 1;
24038 /* We can always use 2 byte VEX prefix in 32bit. */
24039 if (!TARGET_64BIT)
24040 return 2 + 1;
24042 extract_insn_cached (insn);
24044 for (i = recog_data.n_operands - 1; i >= 0; --i)
24045 if (REG_P (recog_data.operand[i]))
24047 /* REX.W bit uses 3 byte VEX prefix. */
24048 if (GET_MODE (recog_data.operand[i]) == DImode
24049 && GENERAL_REG_P (recog_data.operand[i]))
24050 return 3 + 1;
24052 else
24054 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24055 if (MEM_P (recog_data.operand[i])
24056 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24057 return 3 + 1;
24060 return 2 + 1;
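/* Editorial note (not part of GCC): concretely, the function above counts a
   2-byte VEX prefix plus the opcode byte (3 in total) for something like
   vaddps %xmm2, %xmm1, %xmm0, and a 3-byte VEX prefix (4 in total) when the
   opcode is outside the 0f map, when VEX.W is needed (e.g. a DImode general
   register operand), or when a memory operand mentions %r8-%r15 and so needs
   VEX.X or VEX.B, which only the 3-byte form can encode.  */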
24063 /* Return the maximum number of instructions a cpu can issue. */
24065 static int
24066 ix86_issue_rate (void)
24068 switch (ix86_tune)
24070 case PROCESSOR_PENTIUM:
24071 case PROCESSOR_ATOM:
24072 case PROCESSOR_K6:
24073 case PROCESSOR_BTVER2:
24074 return 2;
24076 case PROCESSOR_PENTIUMPRO:
24077 case PROCESSOR_PENTIUM4:
24078 case PROCESSOR_CORE2:
24079 case PROCESSOR_COREI7:
24080 case PROCESSOR_HASWELL:
24081 case PROCESSOR_ATHLON:
24082 case PROCESSOR_K8:
24083 case PROCESSOR_AMDFAM10:
24084 case PROCESSOR_NOCONA:
24085 case PROCESSOR_GENERIC32:
24086 case PROCESSOR_GENERIC64:
24087 case PROCESSOR_BDVER1:
24088 case PROCESSOR_BDVER2:
24089 case PROCESSOR_BDVER3:
24090 case PROCESSOR_BTVER1:
24091 return 3;
24093 default:
24094 return 1;
24098 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24099 by DEP_INSN and nothing else set by DEP_INSN. */
24101 static bool
24102 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24104 rtx set, set2;
24106 /* Simplify the test for uninteresting insns. */
24107 if (insn_type != TYPE_SETCC
24108 && insn_type != TYPE_ICMOV
24109 && insn_type != TYPE_FCMOV
24110 && insn_type != TYPE_IBR)
24111 return false;
24113 if ((set = single_set (dep_insn)) != 0)
24115 set = SET_DEST (set);
24116 set2 = NULL_RTX;
24118 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24119 && XVECLEN (PATTERN (dep_insn), 0) == 2
24120 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24121 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24123 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24124 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24126 else
24127 return false;
24129 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24130 return false;
24132 /* This test is true if the dependent insn reads the flags but
24133 not any other potentially set register. */
24134 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24135 return false;
24137 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24138 return false;
24140 return true;
24143 /* Return true iff USE_INSN has a memory address with operands set by
24144 SET_INSN. */
24146 bool
24147 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24149 int i;
24150 extract_insn_cached (use_insn);
24151 for (i = recog_data.n_operands - 1; i >= 0; --i)
24152 if (MEM_P (recog_data.operand[i]))
24154 rtx addr = XEXP (recog_data.operand[i], 0);
24155 return modified_in_p (addr, set_insn) != 0;
24157 return false;
24160 static int
24161 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24163 enum attr_type insn_type, dep_insn_type;
24164 enum attr_memory memory;
24165 rtx set, set2;
24166 int dep_insn_code_number;
24168 /* Anti and output dependencies have zero cost on all CPUs. */
24169 if (REG_NOTE_KIND (link) != 0)
24170 return 0;
24172 dep_insn_code_number = recog_memoized (dep_insn);
24174 /* If we can't recognize the insns, we can't really do anything. */
24175 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24176 return cost;
24178 insn_type = get_attr_type (insn);
24179 dep_insn_type = get_attr_type (dep_insn);
24181 switch (ix86_tune)
24183 case PROCESSOR_PENTIUM:
24184 /* Address Generation Interlock adds a cycle of latency. */
24185 if (insn_type == TYPE_LEA)
24187 rtx addr = PATTERN (insn);
24189 if (GET_CODE (addr) == PARALLEL)
24190 addr = XVECEXP (addr, 0, 0);
24192 gcc_assert (GET_CODE (addr) == SET);
24194 addr = SET_SRC (addr);
24195 if (modified_in_p (addr, dep_insn))
24196 cost += 1;
24198 else if (ix86_agi_dependent (dep_insn, insn))
24199 cost += 1;
24201 /* ??? Compares pair with jump/setcc. */
24202 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24203 cost = 0;
24205 /* Floating point stores require value to be ready one cycle earlier. */
24206 if (insn_type == TYPE_FMOV
24207 && get_attr_memory (insn) == MEMORY_STORE
24208 && !ix86_agi_dependent (dep_insn, insn))
24209 cost += 1;
24210 break;
24212 case PROCESSOR_PENTIUMPRO:
24213 memory = get_attr_memory (insn);
24215 /* INT->FP conversion is expensive. */
24216 if (get_attr_fp_int_src (dep_insn))
24217 cost += 5;
24219 /* There is one cycle extra latency between an FP op and a store. */
24220 if (insn_type == TYPE_FMOV
24221 && (set = single_set (dep_insn)) != NULL_RTX
24222 && (set2 = single_set (insn)) != NULL_RTX
24223 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24224 && MEM_P (SET_DEST (set2)))
24225 cost += 1;
24227 /* Show the ability of the reorder buffer to hide the latency of a load
24228 by executing it in parallel with the previous instruction when the
24229 previous instruction is not needed to compute the address. */
24230 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24231 && !ix86_agi_dependent (dep_insn, insn))
24233 /* Claim moves to take one cycle, as the core can issue one load
24234 at a time and the next load can start a cycle later. */
24235 if (dep_insn_type == TYPE_IMOV
24236 || dep_insn_type == TYPE_FMOV)
24237 cost = 1;
24238 else if (cost > 1)
24239 cost--;
24241 break;
24243 case PROCESSOR_K6:
24244 memory = get_attr_memory (insn);
24246 /* The esp dependency is resolved before the instruction is really
24247 finished. */
24248 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24249 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24250 return 1;
24252 /* INT->FP conversion is expensive. */
24253 if (get_attr_fp_int_src (dep_insn))
24254 cost += 5;
24256 /* Show the ability of the reorder buffer to hide the latency of a load
24257 by executing it in parallel with the previous instruction when the
24258 previous instruction is not needed to compute the address. */
24259 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24260 && !ix86_agi_dependent (dep_insn, insn))
24262 /* Claim moves to take one cycle, as the core can issue one load
24263 at a time and the next load can start a cycle later. */
24264 if (dep_insn_type == TYPE_IMOV
24265 || dep_insn_type == TYPE_FMOV)
24266 cost = 1;
24267 else if (cost > 2)
24268 cost -= 2;
24269 else
24270 cost = 1;
24272 break;
24274 case PROCESSOR_ATHLON:
24275 case PROCESSOR_K8:
24276 case PROCESSOR_AMDFAM10:
24277 case PROCESSOR_BDVER1:
24278 case PROCESSOR_BDVER2:
24279 case PROCESSOR_BDVER3:
24280 case PROCESSOR_BTVER1:
24281 case PROCESSOR_BTVER2:
24282 case PROCESSOR_ATOM:
24283 case PROCESSOR_GENERIC32:
24284 case PROCESSOR_GENERIC64:
24285 memory = get_attr_memory (insn);
24287 /* Show the ability of the reorder buffer to hide the latency of a load
24288 by executing it in parallel with the previous instruction when the
24289 previous instruction is not needed to compute the address. */
24290 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24291 && !ix86_agi_dependent (dep_insn, insn))
24293 enum attr_unit unit = get_attr_unit (insn);
24294 int loadcost = 3;
24296 /* Because of the difference in length between the integer and floating
24297 point unit pipeline preparation stages, memory operands for floating
24298 point are cheaper.
24300 ??? For Athlon the difference is most probably 2. */
24301 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24302 loadcost = 3;
24303 else
24304 loadcost = TARGET_ATHLON ? 2 : 0;
24306 if (cost >= loadcost)
24307 cost -= loadcost;
24308 else
24309 cost = 0;
24312 default:
24313 break;
24316 return cost;
24319 /* How many alternative schedules to try. This should be as wide as the
24320 scheduling freedom in the DFA, but no wider. Making this value too
24321 large results in extra work for the scheduler. */
24323 static int
24324 ia32_multipass_dfa_lookahead (void)
24326 switch (ix86_tune)
24328 case PROCESSOR_PENTIUM:
24329 return 2;
24331 case PROCESSOR_PENTIUMPRO:
24332 case PROCESSOR_K6:
24333 return 1;
24335 case PROCESSOR_CORE2:
24336 case PROCESSOR_COREI7:
24337 case PROCESSOR_HASWELL:
24338 case PROCESSOR_ATOM:
24339 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24340 as the number of instructions that can be executed in a cycle, i.e.,
24341 issue_rate. I wonder why tuning for many CPUs does not do this. */
24342 if (reload_completed)
24343 return ix86_issue_rate ();
24344 /* Don't use lookahead for pre-reload schedule to save compile time. */
24345 return 0;
24347 default:
24348 return 0;
24352 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24353 execution. The reordering is applied if
24354 (1) an IMUL instruction is at the top of the list and
24355 (2) the ready list contains exactly one producer of an independent
24356 IMUL instruction;
24357 (3) in that case, put the found producer at the top of the ready list.
24358 Returns the issue rate. */
24360 static int
24361 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24362 int clock_var ATTRIBUTE_UNUSED)
24364 static int issue_rate = -1;
24365 int n_ready = *pn_ready;
24366 rtx insn, insn1, insn2;
24367 int i;
24368 sd_iterator_def sd_it;
24369 dep_t dep;
24370 int index = -1;
24372 /* Set up issue rate. */
24373 issue_rate = ix86_issue_rate();
24375 /* Do reordering for Atom only. */
24376 if (ix86_tune != PROCESSOR_ATOM)
24377 return issue_rate;
24378 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24379 if (!reload_completed)
24380 return issue_rate;
24381 /* Nothing to do if ready list contains only 1 instruction. */
24382 if (n_ready <= 1)
24383 return issue_rate;
24385 /* Check that IMUL instruction is on the top of ready list. */
24386 insn = ready[n_ready - 1];
24387 if (!NONDEBUG_INSN_P (insn))
24388 return issue_rate;
24389 insn = PATTERN (insn);
24390 if (GET_CODE (insn) == PARALLEL)
24391 insn = XVECEXP (insn, 0, 0);
24392 if (GET_CODE (insn) != SET)
24393 return issue_rate;
24394 if (!(GET_CODE (SET_SRC (insn)) == MULT
24395 && GET_MODE (SET_SRC (insn)) == SImode))
24396 return issue_rate;
24398 /* Search for producer of independent IMUL instruction. */
24399 for (i = n_ready - 2; i>= 0; i--)
24401 insn = ready[i];
24402 if (!NONDEBUG_INSN_P (insn))
24403 continue;
24404 /* Skip IMUL instruction. */
24405 insn2 = PATTERN (insn);
24406 if (GET_CODE (insn2) == PARALLEL)
24407 insn2 = XVECEXP (insn2, 0, 0);
24408 if (GET_CODE (insn2) == SET
24409 && GET_CODE (SET_SRC (insn2)) == MULT
24410 && GET_MODE (SET_SRC (insn2)) == SImode)
24411 continue;
24413 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24415 rtx con;
24416 con = DEP_CON (dep);
24417 if (!NONDEBUG_INSN_P (con))
24418 continue;
24419 insn1 = PATTERN (con);
24420 if (GET_CODE (insn1) == PARALLEL)
24421 insn1 = XVECEXP (insn1, 0, 0);
24423 if (GET_CODE (insn1) == SET
24424 && GET_CODE (SET_SRC (insn1)) == MULT
24425 && GET_MODE (SET_SRC (insn1)) == SImode)
24427 sd_iterator_def sd_it1;
24428 dep_t dep1;
24429 /* Check if there is no other dependee for IMUL. */
24430 index = i;
24431 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24433 rtx pro;
24434 pro = DEP_PRO (dep1);
24435 if (!NONDEBUG_INSN_P (pro))
24436 continue;
24437 if (pro != insn)
24438 index = -1;
24440 if (index >= 0)
24441 break;
24444 if (index >= 0)
24445 break;
24447 if (index < 0)
24448 return issue_rate; /* Didn't find IMUL producer. */
24450 if (sched_verbose > 1)
24451 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24452 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24454 /* Put IMUL producer (ready[index]) at the top of ready list. */
24455 insn1= ready[index];
24456 for (i = index; i < n_ready - 1; i++)
24457 ready[i] = ready[i + 1];
24458 ready[n_ready - 1] = insn1;
24460 return issue_rate;
24463 static bool
24464 ix86_class_likely_spilled_p (reg_class_t);
24466 /* Return true if the lhs of insn is a HW function argument register, and set
24467 is_spilled to true if it is a likely-spilled HW register. */
24468 static bool
24469 insn_is_function_arg (rtx insn, bool* is_spilled)
24471 rtx dst;
24473 if (!NONDEBUG_INSN_P (insn))
24474 return false;
24475 /* Call instructions are not movable; ignore them. */
24476 if (CALL_P (insn))
24477 return false;
24478 insn = PATTERN (insn);
24479 if (GET_CODE (insn) == PARALLEL)
24480 insn = XVECEXP (insn, 0, 0);
24481 if (GET_CODE (insn) != SET)
24482 return false;
24483 dst = SET_DEST (insn);
24484 if (REG_P (dst) && HARD_REGISTER_P (dst)
24485 && ix86_function_arg_regno_p (REGNO (dst)))
24487 /* Is it likely spilled HW register? */
24488 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24489 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24490 *is_spilled = true;
24491 return true;
24493 return false;
24496 /* Add output dependencies for a chain of adjacent function arguments, but only
24497 if there is a move to a likely-spilled HW register. Return the first argument
24498 if at least one dependence was added, or NULL otherwise. */
24499 static rtx
24500 add_parameter_dependencies (rtx call, rtx head)
24502 rtx insn;
24503 rtx last = call;
24504 rtx first_arg = NULL;
24505 bool is_spilled = false;
24507 head = PREV_INSN (head);
24509 /* Find the argument-passing instruction nearest to the call. */
24510 while (true)
24512 last = PREV_INSN (last);
24513 if (last == head)
24514 return NULL;
24515 if (!NONDEBUG_INSN_P (last))
24516 continue;
24517 if (insn_is_function_arg (last, &is_spilled))
24518 break;
24519 return NULL;
24522 first_arg = last;
24523 while (true)
24525 insn = PREV_INSN (last);
24526 if (!INSN_P (insn))
24527 break;
24528 if (insn == head)
24529 break;
24530 if (!NONDEBUG_INSN_P (insn))
24532 last = insn;
24533 continue;
24535 if (insn_is_function_arg (insn, &is_spilled))
24537 /* Add an output dependence between two function arguments if the chain
24538 of output arguments contains likely-spilled HW registers. */
24539 if (is_spilled)
24540 add_dependence (last, insn, REG_DEP_OUTPUT);
24541 first_arg = last = insn;
24543 else
24544 break;
24546 if (!is_spilled)
24547 return NULL;
24548 return first_arg;
24551 /* Add output or anti dependency from insn to first_arg to restrict its code
24552 motion. */
24553 static void
24554 avoid_func_arg_motion (rtx first_arg, rtx insn)
24556 rtx set;
24557 rtx tmp;
24559 set = single_set (insn);
24560 if (!set)
24561 return;
24562 tmp = SET_DEST (set);
24563 if (REG_P (tmp))
24565 /* Add output dependency to the first function argument. */
24566 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24567 return;
24569 /* Add anti dependency. */
24570 add_dependence (first_arg, insn, REG_DEP_ANTI);
24573 /* Avoid cross-block motion of a function argument by adding a dependency
24574 from the first non-jump instruction in bb. */
24575 static void
24576 add_dependee_for_func_arg (rtx arg, basic_block bb)
24578 rtx insn = BB_END (bb);
24580 while (insn)
24582 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24584 rtx set = single_set (insn);
24585 if (set)
24587 avoid_func_arg_motion (arg, insn);
24588 return;
24591 if (insn == BB_HEAD (bb))
24592 return;
24593 insn = PREV_INSN (insn);
24597 /* Hook for pre-reload schedule - avoid motion of function arguments
24598 passed in likely spilled HW registers. */
24599 static void
24600 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24602 rtx insn;
24603 rtx first_arg = NULL;
24604 if (reload_completed)
24605 return;
24606 while (head != tail && DEBUG_INSN_P (head))
24607 head = NEXT_INSN (head);
24608 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24609 if (INSN_P (insn) && CALL_P (insn))
24611 first_arg = add_parameter_dependencies (insn, head);
24612 if (first_arg)
24614 /* Add a dependee for the first argument to predecessors, but only if the
24615 region contains more than one block. */
24616 basic_block bb = BLOCK_FOR_INSN (insn);
24617 int rgn = CONTAINING_RGN (bb->index);
24618 int nr_blks = RGN_NR_BLOCKS (rgn);
24619 /* Skip trivial regions and region head blocks that can have
24620 predecessors outside of region. */
24621 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24623 edge e;
24624 edge_iterator ei;
24625 /* Assume that region is SCC, i.e. all immediate predecessors
24626 of non-head block are in the same region. */
24627 FOR_EACH_EDGE (e, ei, bb->preds)
24629 /* Avoid creating loop-carried dependencies by using the
24630 topological ordering in the region. */
24631 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24632 add_dependee_for_func_arg (first_arg, e->src);
24635 insn = first_arg;
24636 if (insn == head)
24637 break;
24640 else if (first_arg)
24641 avoid_func_arg_motion (first_arg, insn);
24644 /* Hook for pre-reload schedule - set priority of moves from likely spilled
24645 HW registers to maximum, to schedule them as soon as possible. These are
24646 moves from function argument registers at the top of the function entry
24647 and moves from function return value registers after call. */
24648 static int
24649 ix86_adjust_priority (rtx insn, int priority)
24651 rtx set;
24653 if (reload_completed)
24654 return priority;
24656 if (!NONDEBUG_INSN_P (insn))
24657 return priority;
24659 set = single_set (insn);
24660 if (set)
24662 rtx tmp = SET_SRC (set);
24663 if (REG_P (tmp)
24664 && HARD_REGISTER_P (tmp)
24665 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24666 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24667 return current_sched_info->sched_max_insns_priority;
24670 return priority;
24673 /* Model decoder of Core 2/i7.
24674 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24675 track the instruction fetch block boundaries and make sure that long
24676 (9+ bytes) instructions are assigned to D0. */
24678 /* Maximum length of an insn that can be handled by
24679 a secondary decoder unit. '8' for Core 2/i7. */
24680 static int core2i7_secondary_decoder_max_insn_size;
24682 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24683 '16' for Core 2/i7. */
24684 static int core2i7_ifetch_block_size;
24686 /* Maximum number of instructions decoder can handle per cycle.
24687 '6' for Core 2/i7. */
24688 static int core2i7_ifetch_block_max_insns;
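/* Editorial note (not part of GCC): with these parameters an insn is kept off
   the current cycle's ready set when it is longer than the secondary-decoder
   limit and would not go to decoder 0, when its length would push the running
   ifetch-block total past the block size, or when the block already holds the
   maximum number of insns; see
   core2i7_first_cycle_multipass_filter_ready_try below.  */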
24690 typedef struct ix86_first_cycle_multipass_data_ *
24691 ix86_first_cycle_multipass_data_t;
24692 typedef const struct ix86_first_cycle_multipass_data_ *
24693 const_ix86_first_cycle_multipass_data_t;
24695 /* A variable to store target state across calls to max_issue within
24696 one cycle. */
24697 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24698 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24700 /* Initialize DATA. */
24701 static void
24702 core2i7_first_cycle_multipass_init (void *_data)
24704 ix86_first_cycle_multipass_data_t data
24705 = (ix86_first_cycle_multipass_data_t) _data;
24707 data->ifetch_block_len = 0;
24708 data->ifetch_block_n_insns = 0;
24709 data->ready_try_change = NULL;
24710 data->ready_try_change_size = 0;
24713 /* Advancing the cycle; reset ifetch block counts. */
24714 static void
24715 core2i7_dfa_post_advance_cycle (void)
24717 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24719 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24721 data->ifetch_block_len = 0;
24722 data->ifetch_block_n_insns = 0;
24725 static int min_insn_size (rtx);
24727 /* Filter out insns from ready_try that the core will not be able to issue
24728 on current cycle due to decoder. */
24729 static void
24730 core2i7_first_cycle_multipass_filter_ready_try
24731 (const_ix86_first_cycle_multipass_data_t data,
24732 char *ready_try, int n_ready, bool first_cycle_insn_p)
24734 while (n_ready--)
24736 rtx insn;
24737 int insn_size;
24739 if (ready_try[n_ready])
24740 continue;
24742 insn = get_ready_element (n_ready);
24743 insn_size = min_insn_size (insn);
24745 if (/* If this insn is too long for a secondary decoder ... */
24746 (!first_cycle_insn_p
24747 && insn_size > core2i7_secondary_decoder_max_insn_size)
24748 /* ... or it would not fit into the ifetch block ... */
24749 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24750 /* ... or the decoder is full already ... */
24751 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24752 /* ... mask the insn out. */
24754 ready_try[n_ready] = 1;
24756 if (data->ready_try_change)
24757 bitmap_set_bit (data->ready_try_change, n_ready);
24762 /* Prepare for a new round of multipass lookahead scheduling. */
24763 static void
24764 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24765 bool first_cycle_insn_p)
24767 ix86_first_cycle_multipass_data_t data
24768 = (ix86_first_cycle_multipass_data_t) _data;
24769 const_ix86_first_cycle_multipass_data_t prev_data
24770 = ix86_first_cycle_multipass_data;
24772 /* Restore the state from the end of the previous round. */
24773 data->ifetch_block_len = prev_data->ifetch_block_len;
24774 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24776 /* Filter instructions that cannot be issued on current cycle due to
24777 decoder restrictions. */
24778 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24779 first_cycle_insn_p);
24782 /* INSN is being issued in current solution. Account for its impact on
24783 the decoder model. */
24784 static void
24785 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24786 rtx insn, const void *_prev_data)
24788 ix86_first_cycle_multipass_data_t data
24789 = (ix86_first_cycle_multipass_data_t) _data;
24790 const_ix86_first_cycle_multipass_data_t prev_data
24791 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24793 int insn_size = min_insn_size (insn);
24795 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24796 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24797 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24798 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24800 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24801 if (!data->ready_try_change)
24803 data->ready_try_change = sbitmap_alloc (n_ready);
24804 data->ready_try_change_size = n_ready;
24806 else if (data->ready_try_change_size < n_ready)
24808 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24809 n_ready, 0);
24810 data->ready_try_change_size = n_ready;
24812 bitmap_clear (data->ready_try_change);
24814 /* Filter out insns from ready_try that the core will not be able to issue
24815 on the current cycle due to decoder restrictions. */
24816 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24817 false);
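/* The bits set in ready_try_change by the call above record exactly
   which ready insns this issue step masked out, so that the
   _backtrack hook below can clear them again if the scheduler
   abandons this solution. */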
24820 /* Revert the effect on ready_try. */
24821 static void
24822 core2i7_first_cycle_multipass_backtrack (const void *_data,
24823 char *ready_try,
24824 int n_ready ATTRIBUTE_UNUSED)
24826 const_ix86_first_cycle_multipass_data_t data
24827 = (const_ix86_first_cycle_multipass_data_t) _data;
24828 unsigned int i = 0;
24829 sbitmap_iterator sbi;
24831 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24832 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24834 ready_try[i] = 0;
24838 /* Save the result of multipass lookahead scheduling for the next round. */
24839 static void
24840 core2i7_first_cycle_multipass_end (const void *_data)
24842 const_ix86_first_cycle_multipass_data_t data
24843 = (const_ix86_first_cycle_multipass_data_t) _data;
24844 ix86_first_cycle_multipass_data_t next_data
24845 = ix86_first_cycle_multipass_data;
24847 if (data != NULL)
24849 next_data->ifetch_block_len = data->ifetch_block_len;
24850 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24854 /* Deallocate target data. */
24855 static void
24856 core2i7_first_cycle_multipass_fini (void *_data)
24858 ix86_first_cycle_multipass_data_t data
24859 = (ix86_first_cycle_multipass_data_t) _data;
24861 if (data->ready_try_change)
24863 sbitmap_free (data->ready_try_change);
24864 data->ready_try_change = NULL;
24865 data->ready_try_change_size = 0;
24869 /* Prepare for scheduling pass. */
24870 static void
24871 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24872 int verbose ATTRIBUTE_UNUSED,
24873 int max_uid ATTRIBUTE_UNUSED)
24875 /* Install scheduling hooks for the current CPU. Some of these hooks are used
24876 in time-critical parts of the scheduler, so we only set them up when
24877 they are actually used. */
24878 switch (ix86_tune)
24880 case PROCESSOR_CORE2:
24881 case PROCESSOR_COREI7:
24882 case PROCESSOR_HASWELL:
24883 /* Do not perform multipass scheduling for the pre-reload schedule,
24884 to save compile time. */
24885 if (reload_completed)
24887 targetm.sched.dfa_post_advance_cycle
24888 = core2i7_dfa_post_advance_cycle;
24889 targetm.sched.first_cycle_multipass_init
24890 = core2i7_first_cycle_multipass_init;
24891 targetm.sched.first_cycle_multipass_begin
24892 = core2i7_first_cycle_multipass_begin;
24893 targetm.sched.first_cycle_multipass_issue
24894 = core2i7_first_cycle_multipass_issue;
24895 targetm.sched.first_cycle_multipass_backtrack
24896 = core2i7_first_cycle_multipass_backtrack;
24897 targetm.sched.first_cycle_multipass_end
24898 = core2i7_first_cycle_multipass_end;
24899 targetm.sched.first_cycle_multipass_fini
24900 = core2i7_first_cycle_multipass_fini;
24902 /* Set decoder parameters. */
24903 core2i7_secondary_decoder_max_insn_size = 8;
24904 core2i7_ifetch_block_size = 16;
24905 core2i7_ifetch_block_max_insns = 6;
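/* I.e. model a 16-byte instruction fetch block per cycle, at most 6
   insns decoded from that block, and secondary decoders that only
   accept insns up to 8 bytes long; the filter_ready_try hook above
   masks out ready insns that would exceed these limits. */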
24906 break;
24908 /* ... Fall through ... */
24909 default:
24910 targetm.sched.dfa_post_advance_cycle = NULL;
24911 targetm.sched.first_cycle_multipass_init = NULL;
24912 targetm.sched.first_cycle_multipass_begin = NULL;
24913 targetm.sched.first_cycle_multipass_issue = NULL;
24914 targetm.sched.first_cycle_multipass_backtrack = NULL;
24915 targetm.sched.first_cycle_multipass_end = NULL;
24916 targetm.sched.first_cycle_multipass_fini = NULL;
24917 break;
24922 /* Compute the alignment given to a constant that is being placed in memory.
24923 EXP is the constant and ALIGN is the alignment that the object would
24924 ordinarily have.
24925 The value of this function is used instead of that alignment to align
24926 the object. */
24929 ix86_constant_alignment (tree exp, int align)
24931 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24932 || TREE_CODE (exp) == INTEGER_CST)
24934 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24935 return 64;
24936 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24937 return 128;
24939 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24940 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24941 return BITS_PER_WORD;
24943 return align;
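/* E.g. a double constant is given 64-bit alignment, a 128-bit vector
   constant 128-bit alignment, and (unless optimizing for size) string
   constants of at least 31 characters word alignment. */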
24946 /* Compute the alignment for a static variable.
24947 TYPE is the data type, and ALIGN is the alignment that
24948 the object would ordinarily have. The value of this function is used
24949 instead of that alignment to align the object. */
24952 ix86_data_alignment (tree type, int align)
24954 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24956 if (AGGREGATE_TYPE_P (type)
24957 && TYPE_SIZE (type)
24958 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24959 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24960 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24961 && align < max_align)
24962 align = max_align;
24964 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
24965 to a 16-byte boundary. */
24966 if (TARGET_64BIT)
24968 if (AGGREGATE_TYPE_P (type)
24969 && TYPE_SIZE (type)
24970 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24971 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24972 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24973 return 128;
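/* For example, a file-scope "double buf[2]" (exactly 16 bytes) takes
   the branch above and is given 128-bit alignment, even though double
   by itself only requires 64-bit alignment. */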
24976 if (TREE_CODE (type) == ARRAY_TYPE)
24978 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24979 return 64;
24980 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24981 return 128;
24983 else if (TREE_CODE (type) == COMPLEX_TYPE)
24986 if (TYPE_MODE (type) == DCmode && align < 64)
24987 return 64;
24988 if ((TYPE_MODE (type) == XCmode
24989 || TYPE_MODE (type) == TCmode) && align < 128)
24990 return 128;
24992 else if ((TREE_CODE (type) == RECORD_TYPE
24993 || TREE_CODE (type) == UNION_TYPE
24994 || TREE_CODE (type) == QUAL_UNION_TYPE)
24995 && TYPE_FIELDS (type))
24997 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24998 return 64;
24999 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25000 return 128;
25002 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25003 || TREE_CODE (type) == INTEGER_TYPE)
25005 if (TYPE_MODE (type) == DFmode && align < 64)
25006 return 64;
25007 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25008 return 128;
25011 return align;
25014 /* Compute the alignment for a local variable or a stack slot. EXP is
25015 the data type or decl itself, MODE is the widest mode available and
25016 ALIGN is the alignment that the object would ordinarily have. The
25017 value of this macro is used instead of that alignment to align the
25018 object. */
25020 unsigned int
25021 ix86_local_alignment (tree exp, enum machine_mode mode,
25022 unsigned int align)
25024 tree type, decl;
25026 if (exp && DECL_P (exp))
25028 type = TREE_TYPE (exp);
25029 decl = exp;
25031 else
25033 type = exp;
25034 decl = NULL;
25037 /* Don't do dynamic stack realignment for long long objects with
25038 -mpreferred-stack-boundary=2. */
25039 if (!TARGET_64BIT
25040 && align == 64
25041 && ix86_preferred_stack_boundary < 64
25042 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25043 && (!type || !TYPE_USER_ALIGN (type))
25044 && (!decl || !DECL_USER_ALIGN (decl)))
25045 align = 32;
25047 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
25048 register in MODE. Return the larger of the XFmode and DFmode
25049 alignments. */
25050 if (!type)
25052 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25053 align = GET_MODE_ALIGNMENT (DFmode);
25054 return align;
25057 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
25058 to a 16-byte boundary. The exact wording is:
25060 An array uses the same alignment as its elements, except that a local or
25061 global array variable of length at least 16 bytes or
25062 a C99 variable-length array variable always has alignment of at least 16 bytes.
25064 This was added to allow use of aligned SSE instructions on arrays. The
25065 rule is meant for static storage (where the compiler cannot do the analysis
25066 by itself). We follow it for automatic variables only when convenient:
25067 we fully control everything in the function being compiled, and functions
25068 from other units cannot rely on the alignment.
25070 Exclude the va_list type. It is the common case of a local array where
25071 we cannot benefit from the alignment. */
25072 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25073 && TARGET_SSE)
25075 if (AGGREGATE_TYPE_P (type)
25076 && (va_list_type_node == NULL_TREE
25077 || (TYPE_MAIN_VARIANT (type)
25078 != TYPE_MAIN_VARIANT (va_list_type_node)))
25079 && TYPE_SIZE (type)
25080 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25081 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25082 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25083 return 128;
25085 if (TREE_CODE (type) == ARRAY_TYPE)
25087 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25088 return 64;
25089 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25090 return 128;
25092 else if (TREE_CODE (type) == COMPLEX_TYPE)
25094 if (TYPE_MODE (type) == DCmode && align < 64)
25095 return 64;
25096 if ((TYPE_MODE (type) == XCmode
25097 || TYPE_MODE (type) == TCmode) && align < 128)
25098 return 128;
25100 else if ((TREE_CODE (type) == RECORD_TYPE
25101 || TREE_CODE (type) == UNION_TYPE
25102 || TREE_CODE (type) == QUAL_UNION_TYPE)
25103 && TYPE_FIELDS (type))
25105 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25106 return 64;
25107 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25108 return 128;
25110 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25111 || TREE_CODE (type) == INTEGER_TYPE)
25114 if (TYPE_MODE (type) == DFmode && align < 64)
25115 return 64;
25116 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25117 return 128;
25119 return align;
25122 /* Compute the minimum required alignment for dynamic stack realignment
25123 purposes for a local variable, parameter or a stack slot. EXP is
25124 the data type or decl itself, MODE is its mode and ALIGN is the
25125 alignment that the object would ordinarily have. */
25127 unsigned int
25128 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25129 unsigned int align)
25131 tree type, decl;
25133 if (exp && DECL_P (exp))
25135 type = TREE_TYPE (exp);
25136 decl = exp;
25138 else
25140 type = exp;
25141 decl = NULL;
25144 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25145 return align;
25147 /* Don't do dynamic stack realignment for long long objects with
25148 -mpreferred-stack-boundary=2. */
25149 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25150 && (!type || !TYPE_USER_ALIGN (type))
25151 && (!decl || !DECL_USER_ALIGN (decl)))
25152 return 32;
25154 return align;
25157 /* Find a location for the static chain incoming to a nested function.
25158 This is a register, unless all free registers are used by arguments. */
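/* In summary: R10 in 64-bit mode; in 32-bit mode ECX by default, EAX
   for fastcall/thiscall, and a stack slot (reached through an
   ESI-pushing alternate entry point) for regparm-3 functions. */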
25160 static rtx
25161 ix86_static_chain (const_tree fndecl, bool incoming_p)
25163 unsigned regno;
25165 if (!DECL_STATIC_CHAIN (fndecl))
25166 return NULL;
25168 if (TARGET_64BIT)
25170 /* We always use R10 in 64-bit mode. */
25171 regno = R10_REG;
25173 else
25175 tree fntype;
25176 unsigned int ccvt;
25178 /* By default in 32-bit mode we use ECX to pass the static chain. */
25179 regno = CX_REG;
25181 fntype = TREE_TYPE (fndecl);
25182 ccvt = ix86_get_callcvt (fntype);
25183 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25185 /* Fastcall functions use ecx/edx for arguments, which leaves
25186 us with EAX for the static chain.
25187 Thiscall functions use ecx for arguments, which also
25188 leaves us with EAX for the static chain. */
25189 regno = AX_REG;
25191 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25193 /* Thiscall functions use ecx for arguments, which leaves
25194 us with EAX and EDX for the static chain.
25195 We use EAX for ABI compatibility. */
25196 regno = AX_REG;
25198 else if (ix86_function_regparm (fntype, fndecl) == 3)
25200 /* For regparm 3, we have no free call-clobbered registers in
25201 which to store the static chain. In order to implement this,
25202 we have the trampoline push the static chain to the stack.
25203 However, we can't push a value below the return address when
25204 we call the nested function directly, so we have to use an
25205 alternate entry point. For this we use ESI, and have the
25206 alternate entry point push ESI, so that things appear the
25207 same once we're executing the nested function. */
25208 if (incoming_p)
25210 if (fndecl == current_function_decl)
25211 ix86_static_chain_on_stack = true;
25212 return gen_frame_mem (SImode,
25213 plus_constant (Pmode,
25214 arg_pointer_rtx, -8));
25216 regno = SI_REG;
25220 return gen_rtx_REG (Pmode, regno);
25223 /* Emit RTL insns to initialize the variable parts of a trampoline.
25224 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25225 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25226 to be passed to the target function. */
25228 static void
25229 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25231 rtx mem, fnaddr;
25232 int opcode;
25233 int offset = 0;
25235 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25237 if (TARGET_64BIT)
25239 int size;
25241 /* Load the function address into r11. Try to load the address using
25242 the shorter movl instead of movabs. We may want to support
25243 movq for kernel mode, but the kernel does not use trampolines at
25244 the moment. FNADDR is a 32-bit address and may not be in
25245 DImode when ptr_mode == SImode. Always use movl in this
25246 case. */
25247 if (ptr_mode == SImode
25248 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25250 fnaddr = copy_addr_to_reg (fnaddr);
25252 mem = adjust_address (m_tramp, HImode, offset);
25253 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25255 mem = adjust_address (m_tramp, SImode, offset + 2);
25256 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25257 offset += 6;
25259 else
25261 mem = adjust_address (m_tramp, HImode, offset);
25262 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25264 mem = adjust_address (m_tramp, DImode, offset + 2);
25265 emit_move_insn (mem, fnaddr);
25266 offset += 10;
25269 /* Load static chain using movabs to r10. Use the shorter movl
25270 instead of movabs when ptr_mode == SImode. */
25271 if (ptr_mode == SImode)
25273 opcode = 0xba41;
25274 size = 6;
25276 else
25278 opcode = 0xba49;
25279 size = 10;
25282 mem = adjust_address (m_tramp, HImode, offset);
25283 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25285 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25286 emit_move_insn (mem, chain_value);
25287 offset += size;
25289 /* Jump to r11; the last (unused) byte is a nop, only there to
25290 pad the write out to a single 32-bit store. */
25291 mem = adjust_address (m_tramp, SImode, offset);
25292 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25293 offset += 4;
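/* The resulting 64-bit trampoline, in its full movabs form, is:
   49 bb <imm64>  movabs $fnaddr, %r11
   49 ba <imm64>  movabs $chain,  %r10
   49 ff e3       jmp    *%r11
   90             nop  (pads the final write to a full 32-bit store)  */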
25295 else
25297 rtx disp, chain;
25299 /* Depending on the static chain location, either load a register
25300 with a constant, or push the constant to the stack. All of the
25301 instructions are the same size. */
25302 chain = ix86_static_chain (fndecl, true);
25303 if (REG_P (chain))
25305 switch (REGNO (chain))
25307 case AX_REG:
25308 opcode = 0xb8; break;
25309 case CX_REG:
25310 opcode = 0xb9; break;
25311 default:
25312 gcc_unreachable ();
25315 else
25316 opcode = 0x68;
25318 mem = adjust_address (m_tramp, QImode, offset);
25319 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25321 mem = adjust_address (m_tramp, SImode, offset + 1);
25322 emit_move_insn (mem, chain_value);
25323 offset += 5;
25325 mem = adjust_address (m_tramp, QImode, offset);
25326 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25328 mem = adjust_address (m_tramp, SImode, offset + 1);
25330 /* Compute offset from the end of the jmp to the target function.
25331 In the case in which the trampoline stores the static chain on
25332 the stack, we need to skip the first insn which pushes the
25333 (call-saved) register static chain; this push is 1 byte. */
25334 offset += 5;
25335 disp = expand_binop (SImode, sub_optab, fnaddr,
25336 plus_constant (Pmode, XEXP (m_tramp, 0),
25337 offset - (MEM_P (chain) ? 1 : 0)),
25338 NULL_RTX, 1, OPTAB_DIRECT);
25339 emit_move_insn (mem, disp);
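/* The resulting 32-bit trampoline is
   b8/b9 <imm32>  movl  $chain, %eax / %ecx   (or  68 <imm32>  pushl $chain)
   e9 <rel32>     jmp   <target>
   where the displacement computed above accounts for the 1-byte push
   at the target's alternate entry point in the pushed-chain case.  */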
25342 gcc_assert (offset <= TRAMPOLINE_SIZE);
25344 #ifdef HAVE_ENABLE_EXECUTE_STACK
25345 #ifdef CHECK_EXECUTE_STACK_ENABLED
25346 if (CHECK_EXECUTE_STACK_ENABLED)
25347 #endif
25348 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25349 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25350 #endif
25353 /* The following file contains several enumerations and data structures
25354 built from the definitions in i386-builtin-types.def. */
25356 #include "i386-builtin-types.inc"
25358 /* Table for the ix86 builtin non-function types. */
25359 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25361 /* Retrieve an element from the above table, building some of
25362 the types lazily. */
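/* The tcode space is laid out as primitive types, then vector types
   (up to IX86_BT_LAST_VECT), then pointer types, unqualified up to
   IX86_BT_LAST_PTR and const-qualified after that; the index
   arithmetic below relies on this ordering. */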
25364 static tree
25365 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25367 unsigned int index;
25368 tree type, itype;
25370 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25372 type = ix86_builtin_type_tab[(int) tcode];
25373 if (type != NULL)
25374 return type;
25376 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25377 if (tcode <= IX86_BT_LAST_VECT)
25379 enum machine_mode mode;
25381 index = tcode - IX86_BT_LAST_PRIM - 1;
25382 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25383 mode = ix86_builtin_type_vect_mode[index];
25385 type = build_vector_type_for_mode (itype, mode);
25387 else
25389 int quals;
25391 index = tcode - IX86_BT_LAST_VECT - 1;
25392 if (tcode <= IX86_BT_LAST_PTR)
25393 quals = TYPE_UNQUALIFIED;
25394 else
25395 quals = TYPE_QUAL_CONST;
25397 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25398 if (quals != TYPE_UNQUALIFIED)
25399 itype = build_qualified_type (itype, quals);
25401 type = build_pointer_type (itype);
25404 ix86_builtin_type_tab[(int) tcode] = type;
25405 return type;
25408 /* Table for the ix86 builtin function types. */
25409 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25411 /* Retrieve an element from the above table, building some of
25412 the types lazily. */
25414 static tree
25415 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25417 tree type;
25419 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25421 type = ix86_builtin_func_type_tab[(int) tcode];
25422 if (type != NULL)
25423 return type;
25425 if (tcode <= IX86_BT_LAST_FUNC)
25427 unsigned start = ix86_builtin_func_start[(int) tcode];
25428 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25429 tree rtype, atype, args = void_list_node;
25430 unsigned i;
25432 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25433 for (i = after - 1; i > start; --i)
25435 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25436 args = tree_cons (NULL, atype, args);
25439 type = build_function_type (rtype, args);
25441 else
25443 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25444 enum ix86_builtin_func_type icode;
25446 icode = ix86_builtin_func_alias_base[index];
25447 type = ix86_get_builtin_func_type (icode);
25450 ix86_builtin_func_type_tab[(int) tcode] = type;
25451 return type;
25455 /* Codes for all the SSE/MMX builtins. */
25456 enum ix86_builtins
25458 IX86_BUILTIN_ADDPS,
25459 IX86_BUILTIN_ADDSS,
25460 IX86_BUILTIN_DIVPS,
25461 IX86_BUILTIN_DIVSS,
25462 IX86_BUILTIN_MULPS,
25463 IX86_BUILTIN_MULSS,
25464 IX86_BUILTIN_SUBPS,
25465 IX86_BUILTIN_SUBSS,
25467 IX86_BUILTIN_CMPEQPS,
25468 IX86_BUILTIN_CMPLTPS,
25469 IX86_BUILTIN_CMPLEPS,
25470 IX86_BUILTIN_CMPGTPS,
25471 IX86_BUILTIN_CMPGEPS,
25472 IX86_BUILTIN_CMPNEQPS,
25473 IX86_BUILTIN_CMPNLTPS,
25474 IX86_BUILTIN_CMPNLEPS,
25475 IX86_BUILTIN_CMPNGTPS,
25476 IX86_BUILTIN_CMPNGEPS,
25477 IX86_BUILTIN_CMPORDPS,
25478 IX86_BUILTIN_CMPUNORDPS,
25479 IX86_BUILTIN_CMPEQSS,
25480 IX86_BUILTIN_CMPLTSS,
25481 IX86_BUILTIN_CMPLESS,
25482 IX86_BUILTIN_CMPNEQSS,
25483 IX86_BUILTIN_CMPNLTSS,
25484 IX86_BUILTIN_CMPNLESS,
25485 IX86_BUILTIN_CMPNGTSS,
25486 IX86_BUILTIN_CMPNGESS,
25487 IX86_BUILTIN_CMPORDSS,
25488 IX86_BUILTIN_CMPUNORDSS,
25490 IX86_BUILTIN_COMIEQSS,
25491 IX86_BUILTIN_COMILTSS,
25492 IX86_BUILTIN_COMILESS,
25493 IX86_BUILTIN_COMIGTSS,
25494 IX86_BUILTIN_COMIGESS,
25495 IX86_BUILTIN_COMINEQSS,
25496 IX86_BUILTIN_UCOMIEQSS,
25497 IX86_BUILTIN_UCOMILTSS,
25498 IX86_BUILTIN_UCOMILESS,
25499 IX86_BUILTIN_UCOMIGTSS,
25500 IX86_BUILTIN_UCOMIGESS,
25501 IX86_BUILTIN_UCOMINEQSS,
25503 IX86_BUILTIN_CVTPI2PS,
25504 IX86_BUILTIN_CVTPS2PI,
25505 IX86_BUILTIN_CVTSI2SS,
25506 IX86_BUILTIN_CVTSI642SS,
25507 IX86_BUILTIN_CVTSS2SI,
25508 IX86_BUILTIN_CVTSS2SI64,
25509 IX86_BUILTIN_CVTTPS2PI,
25510 IX86_BUILTIN_CVTTSS2SI,
25511 IX86_BUILTIN_CVTTSS2SI64,
25513 IX86_BUILTIN_MAXPS,
25514 IX86_BUILTIN_MAXSS,
25515 IX86_BUILTIN_MINPS,
25516 IX86_BUILTIN_MINSS,
25518 IX86_BUILTIN_LOADUPS,
25519 IX86_BUILTIN_STOREUPS,
25520 IX86_BUILTIN_MOVSS,
25522 IX86_BUILTIN_MOVHLPS,
25523 IX86_BUILTIN_MOVLHPS,
25524 IX86_BUILTIN_LOADHPS,
25525 IX86_BUILTIN_LOADLPS,
25526 IX86_BUILTIN_STOREHPS,
25527 IX86_BUILTIN_STORELPS,
25529 IX86_BUILTIN_MASKMOVQ,
25530 IX86_BUILTIN_MOVMSKPS,
25531 IX86_BUILTIN_PMOVMSKB,
25533 IX86_BUILTIN_MOVNTPS,
25534 IX86_BUILTIN_MOVNTQ,
25536 IX86_BUILTIN_LOADDQU,
25537 IX86_BUILTIN_STOREDQU,
25539 IX86_BUILTIN_PACKSSWB,
25540 IX86_BUILTIN_PACKSSDW,
25541 IX86_BUILTIN_PACKUSWB,
25543 IX86_BUILTIN_PADDB,
25544 IX86_BUILTIN_PADDW,
25545 IX86_BUILTIN_PADDD,
25546 IX86_BUILTIN_PADDQ,
25547 IX86_BUILTIN_PADDSB,
25548 IX86_BUILTIN_PADDSW,
25549 IX86_BUILTIN_PADDUSB,
25550 IX86_BUILTIN_PADDUSW,
25551 IX86_BUILTIN_PSUBB,
25552 IX86_BUILTIN_PSUBW,
25553 IX86_BUILTIN_PSUBD,
25554 IX86_BUILTIN_PSUBQ,
25555 IX86_BUILTIN_PSUBSB,
25556 IX86_BUILTIN_PSUBSW,
25557 IX86_BUILTIN_PSUBUSB,
25558 IX86_BUILTIN_PSUBUSW,
25560 IX86_BUILTIN_PAND,
25561 IX86_BUILTIN_PANDN,
25562 IX86_BUILTIN_POR,
25563 IX86_BUILTIN_PXOR,
25565 IX86_BUILTIN_PAVGB,
25566 IX86_BUILTIN_PAVGW,
25568 IX86_BUILTIN_PCMPEQB,
25569 IX86_BUILTIN_PCMPEQW,
25570 IX86_BUILTIN_PCMPEQD,
25571 IX86_BUILTIN_PCMPGTB,
25572 IX86_BUILTIN_PCMPGTW,
25573 IX86_BUILTIN_PCMPGTD,
25575 IX86_BUILTIN_PMADDWD,
25577 IX86_BUILTIN_PMAXSW,
25578 IX86_BUILTIN_PMAXUB,
25579 IX86_BUILTIN_PMINSW,
25580 IX86_BUILTIN_PMINUB,
25582 IX86_BUILTIN_PMULHUW,
25583 IX86_BUILTIN_PMULHW,
25584 IX86_BUILTIN_PMULLW,
25586 IX86_BUILTIN_PSADBW,
25587 IX86_BUILTIN_PSHUFW,
25589 IX86_BUILTIN_PSLLW,
25590 IX86_BUILTIN_PSLLD,
25591 IX86_BUILTIN_PSLLQ,
25592 IX86_BUILTIN_PSRAW,
25593 IX86_BUILTIN_PSRAD,
25594 IX86_BUILTIN_PSRLW,
25595 IX86_BUILTIN_PSRLD,
25596 IX86_BUILTIN_PSRLQ,
25597 IX86_BUILTIN_PSLLWI,
25598 IX86_BUILTIN_PSLLDI,
25599 IX86_BUILTIN_PSLLQI,
25600 IX86_BUILTIN_PSRAWI,
25601 IX86_BUILTIN_PSRADI,
25602 IX86_BUILTIN_PSRLWI,
25603 IX86_BUILTIN_PSRLDI,
25604 IX86_BUILTIN_PSRLQI,
25606 IX86_BUILTIN_PUNPCKHBW,
25607 IX86_BUILTIN_PUNPCKHWD,
25608 IX86_BUILTIN_PUNPCKHDQ,
25609 IX86_BUILTIN_PUNPCKLBW,
25610 IX86_BUILTIN_PUNPCKLWD,
25611 IX86_BUILTIN_PUNPCKLDQ,
25613 IX86_BUILTIN_SHUFPS,
25615 IX86_BUILTIN_RCPPS,
25616 IX86_BUILTIN_RCPSS,
25617 IX86_BUILTIN_RSQRTPS,
25618 IX86_BUILTIN_RSQRTPS_NR,
25619 IX86_BUILTIN_RSQRTSS,
25620 IX86_BUILTIN_RSQRTF,
25621 IX86_BUILTIN_SQRTPS,
25622 IX86_BUILTIN_SQRTPS_NR,
25623 IX86_BUILTIN_SQRTSS,
25625 IX86_BUILTIN_UNPCKHPS,
25626 IX86_BUILTIN_UNPCKLPS,
25628 IX86_BUILTIN_ANDPS,
25629 IX86_BUILTIN_ANDNPS,
25630 IX86_BUILTIN_ORPS,
25631 IX86_BUILTIN_XORPS,
25633 IX86_BUILTIN_EMMS,
25634 IX86_BUILTIN_LDMXCSR,
25635 IX86_BUILTIN_STMXCSR,
25636 IX86_BUILTIN_SFENCE,
25638 IX86_BUILTIN_FXSAVE,
25639 IX86_BUILTIN_FXRSTOR,
25640 IX86_BUILTIN_FXSAVE64,
25641 IX86_BUILTIN_FXRSTOR64,
25643 IX86_BUILTIN_XSAVE,
25644 IX86_BUILTIN_XRSTOR,
25645 IX86_BUILTIN_XSAVE64,
25646 IX86_BUILTIN_XRSTOR64,
25648 IX86_BUILTIN_XSAVEOPT,
25649 IX86_BUILTIN_XSAVEOPT64,
25651 /* 3DNow! Original */
25652 IX86_BUILTIN_FEMMS,
25653 IX86_BUILTIN_PAVGUSB,
25654 IX86_BUILTIN_PF2ID,
25655 IX86_BUILTIN_PFACC,
25656 IX86_BUILTIN_PFADD,
25657 IX86_BUILTIN_PFCMPEQ,
25658 IX86_BUILTIN_PFCMPGE,
25659 IX86_BUILTIN_PFCMPGT,
25660 IX86_BUILTIN_PFMAX,
25661 IX86_BUILTIN_PFMIN,
25662 IX86_BUILTIN_PFMUL,
25663 IX86_BUILTIN_PFRCP,
25664 IX86_BUILTIN_PFRCPIT1,
25665 IX86_BUILTIN_PFRCPIT2,
25666 IX86_BUILTIN_PFRSQIT1,
25667 IX86_BUILTIN_PFRSQRT,
25668 IX86_BUILTIN_PFSUB,
25669 IX86_BUILTIN_PFSUBR,
25670 IX86_BUILTIN_PI2FD,
25671 IX86_BUILTIN_PMULHRW,
25673 /* 3DNow! Athlon Extensions */
25674 IX86_BUILTIN_PF2IW,
25675 IX86_BUILTIN_PFNACC,
25676 IX86_BUILTIN_PFPNACC,
25677 IX86_BUILTIN_PI2FW,
25678 IX86_BUILTIN_PSWAPDSI,
25679 IX86_BUILTIN_PSWAPDSF,
25681 /* SSE2 */
25682 IX86_BUILTIN_ADDPD,
25683 IX86_BUILTIN_ADDSD,
25684 IX86_BUILTIN_DIVPD,
25685 IX86_BUILTIN_DIVSD,
25686 IX86_BUILTIN_MULPD,
25687 IX86_BUILTIN_MULSD,
25688 IX86_BUILTIN_SUBPD,
25689 IX86_BUILTIN_SUBSD,
25691 IX86_BUILTIN_CMPEQPD,
25692 IX86_BUILTIN_CMPLTPD,
25693 IX86_BUILTIN_CMPLEPD,
25694 IX86_BUILTIN_CMPGTPD,
25695 IX86_BUILTIN_CMPGEPD,
25696 IX86_BUILTIN_CMPNEQPD,
25697 IX86_BUILTIN_CMPNLTPD,
25698 IX86_BUILTIN_CMPNLEPD,
25699 IX86_BUILTIN_CMPNGTPD,
25700 IX86_BUILTIN_CMPNGEPD,
25701 IX86_BUILTIN_CMPORDPD,
25702 IX86_BUILTIN_CMPUNORDPD,
25703 IX86_BUILTIN_CMPEQSD,
25704 IX86_BUILTIN_CMPLTSD,
25705 IX86_BUILTIN_CMPLESD,
25706 IX86_BUILTIN_CMPNEQSD,
25707 IX86_BUILTIN_CMPNLTSD,
25708 IX86_BUILTIN_CMPNLESD,
25709 IX86_BUILTIN_CMPORDSD,
25710 IX86_BUILTIN_CMPUNORDSD,
25712 IX86_BUILTIN_COMIEQSD,
25713 IX86_BUILTIN_COMILTSD,
25714 IX86_BUILTIN_COMILESD,
25715 IX86_BUILTIN_COMIGTSD,
25716 IX86_BUILTIN_COMIGESD,
25717 IX86_BUILTIN_COMINEQSD,
25718 IX86_BUILTIN_UCOMIEQSD,
25719 IX86_BUILTIN_UCOMILTSD,
25720 IX86_BUILTIN_UCOMILESD,
25721 IX86_BUILTIN_UCOMIGTSD,
25722 IX86_BUILTIN_UCOMIGESD,
25723 IX86_BUILTIN_UCOMINEQSD,
25725 IX86_BUILTIN_MAXPD,
25726 IX86_BUILTIN_MAXSD,
25727 IX86_BUILTIN_MINPD,
25728 IX86_BUILTIN_MINSD,
25730 IX86_BUILTIN_ANDPD,
25731 IX86_BUILTIN_ANDNPD,
25732 IX86_BUILTIN_ORPD,
25733 IX86_BUILTIN_XORPD,
25735 IX86_BUILTIN_SQRTPD,
25736 IX86_BUILTIN_SQRTSD,
25738 IX86_BUILTIN_UNPCKHPD,
25739 IX86_BUILTIN_UNPCKLPD,
25741 IX86_BUILTIN_SHUFPD,
25743 IX86_BUILTIN_LOADUPD,
25744 IX86_BUILTIN_STOREUPD,
25745 IX86_BUILTIN_MOVSD,
25747 IX86_BUILTIN_LOADHPD,
25748 IX86_BUILTIN_LOADLPD,
25750 IX86_BUILTIN_CVTDQ2PD,
25751 IX86_BUILTIN_CVTDQ2PS,
25753 IX86_BUILTIN_CVTPD2DQ,
25754 IX86_BUILTIN_CVTPD2PI,
25755 IX86_BUILTIN_CVTPD2PS,
25756 IX86_BUILTIN_CVTTPD2DQ,
25757 IX86_BUILTIN_CVTTPD2PI,
25759 IX86_BUILTIN_CVTPI2PD,
25760 IX86_BUILTIN_CVTSI2SD,
25761 IX86_BUILTIN_CVTSI642SD,
25763 IX86_BUILTIN_CVTSD2SI,
25764 IX86_BUILTIN_CVTSD2SI64,
25765 IX86_BUILTIN_CVTSD2SS,
25766 IX86_BUILTIN_CVTSS2SD,
25767 IX86_BUILTIN_CVTTSD2SI,
25768 IX86_BUILTIN_CVTTSD2SI64,
25770 IX86_BUILTIN_CVTPS2DQ,
25771 IX86_BUILTIN_CVTPS2PD,
25772 IX86_BUILTIN_CVTTPS2DQ,
25774 IX86_BUILTIN_MOVNTI,
25775 IX86_BUILTIN_MOVNTI64,
25776 IX86_BUILTIN_MOVNTPD,
25777 IX86_BUILTIN_MOVNTDQ,
25779 IX86_BUILTIN_MOVQ128,
25781 /* SSE2 MMX */
25782 IX86_BUILTIN_MASKMOVDQU,
25783 IX86_BUILTIN_MOVMSKPD,
25784 IX86_BUILTIN_PMOVMSKB128,
25786 IX86_BUILTIN_PACKSSWB128,
25787 IX86_BUILTIN_PACKSSDW128,
25788 IX86_BUILTIN_PACKUSWB128,
25790 IX86_BUILTIN_PADDB128,
25791 IX86_BUILTIN_PADDW128,
25792 IX86_BUILTIN_PADDD128,
25793 IX86_BUILTIN_PADDQ128,
25794 IX86_BUILTIN_PADDSB128,
25795 IX86_BUILTIN_PADDSW128,
25796 IX86_BUILTIN_PADDUSB128,
25797 IX86_BUILTIN_PADDUSW128,
25798 IX86_BUILTIN_PSUBB128,
25799 IX86_BUILTIN_PSUBW128,
25800 IX86_BUILTIN_PSUBD128,
25801 IX86_BUILTIN_PSUBQ128,
25802 IX86_BUILTIN_PSUBSB128,
25803 IX86_BUILTIN_PSUBSW128,
25804 IX86_BUILTIN_PSUBUSB128,
25805 IX86_BUILTIN_PSUBUSW128,
25807 IX86_BUILTIN_PAND128,
25808 IX86_BUILTIN_PANDN128,
25809 IX86_BUILTIN_POR128,
25810 IX86_BUILTIN_PXOR128,
25812 IX86_BUILTIN_PAVGB128,
25813 IX86_BUILTIN_PAVGW128,
25815 IX86_BUILTIN_PCMPEQB128,
25816 IX86_BUILTIN_PCMPEQW128,
25817 IX86_BUILTIN_PCMPEQD128,
25818 IX86_BUILTIN_PCMPGTB128,
25819 IX86_BUILTIN_PCMPGTW128,
25820 IX86_BUILTIN_PCMPGTD128,
25822 IX86_BUILTIN_PMADDWD128,
25824 IX86_BUILTIN_PMAXSW128,
25825 IX86_BUILTIN_PMAXUB128,
25826 IX86_BUILTIN_PMINSW128,
25827 IX86_BUILTIN_PMINUB128,
25829 IX86_BUILTIN_PMULUDQ,
25830 IX86_BUILTIN_PMULUDQ128,
25831 IX86_BUILTIN_PMULHUW128,
25832 IX86_BUILTIN_PMULHW128,
25833 IX86_BUILTIN_PMULLW128,
25835 IX86_BUILTIN_PSADBW128,
25836 IX86_BUILTIN_PSHUFHW,
25837 IX86_BUILTIN_PSHUFLW,
25838 IX86_BUILTIN_PSHUFD,
25840 IX86_BUILTIN_PSLLDQI128,
25841 IX86_BUILTIN_PSLLWI128,
25842 IX86_BUILTIN_PSLLDI128,
25843 IX86_BUILTIN_PSLLQI128,
25844 IX86_BUILTIN_PSRAWI128,
25845 IX86_BUILTIN_PSRADI128,
25846 IX86_BUILTIN_PSRLDQI128,
25847 IX86_BUILTIN_PSRLWI128,
25848 IX86_BUILTIN_PSRLDI128,
25849 IX86_BUILTIN_PSRLQI128,
25851 IX86_BUILTIN_PSLLDQ128,
25852 IX86_BUILTIN_PSLLW128,
25853 IX86_BUILTIN_PSLLD128,
25854 IX86_BUILTIN_PSLLQ128,
25855 IX86_BUILTIN_PSRAW128,
25856 IX86_BUILTIN_PSRAD128,
25857 IX86_BUILTIN_PSRLW128,
25858 IX86_BUILTIN_PSRLD128,
25859 IX86_BUILTIN_PSRLQ128,
25861 IX86_BUILTIN_PUNPCKHBW128,
25862 IX86_BUILTIN_PUNPCKHWD128,
25863 IX86_BUILTIN_PUNPCKHDQ128,
25864 IX86_BUILTIN_PUNPCKHQDQ128,
25865 IX86_BUILTIN_PUNPCKLBW128,
25866 IX86_BUILTIN_PUNPCKLWD128,
25867 IX86_BUILTIN_PUNPCKLDQ128,
25868 IX86_BUILTIN_PUNPCKLQDQ128,
25870 IX86_BUILTIN_CLFLUSH,
25871 IX86_BUILTIN_MFENCE,
25872 IX86_BUILTIN_LFENCE,
25873 IX86_BUILTIN_PAUSE,
25875 IX86_BUILTIN_BSRSI,
25876 IX86_BUILTIN_BSRDI,
25877 IX86_BUILTIN_RDPMC,
25878 IX86_BUILTIN_RDTSC,
25879 IX86_BUILTIN_RDTSCP,
25880 IX86_BUILTIN_ROLQI,
25881 IX86_BUILTIN_ROLHI,
25882 IX86_BUILTIN_RORQI,
25883 IX86_BUILTIN_RORHI,
25885 /* SSE3. */
25886 IX86_BUILTIN_ADDSUBPS,
25887 IX86_BUILTIN_HADDPS,
25888 IX86_BUILTIN_HSUBPS,
25889 IX86_BUILTIN_MOVSHDUP,
25890 IX86_BUILTIN_MOVSLDUP,
25891 IX86_BUILTIN_ADDSUBPD,
25892 IX86_BUILTIN_HADDPD,
25893 IX86_BUILTIN_HSUBPD,
25894 IX86_BUILTIN_LDDQU,
25896 IX86_BUILTIN_MONITOR,
25897 IX86_BUILTIN_MWAIT,
25899 /* SSSE3. */
25900 IX86_BUILTIN_PHADDW,
25901 IX86_BUILTIN_PHADDD,
25902 IX86_BUILTIN_PHADDSW,
25903 IX86_BUILTIN_PHSUBW,
25904 IX86_BUILTIN_PHSUBD,
25905 IX86_BUILTIN_PHSUBSW,
25906 IX86_BUILTIN_PMADDUBSW,
25907 IX86_BUILTIN_PMULHRSW,
25908 IX86_BUILTIN_PSHUFB,
25909 IX86_BUILTIN_PSIGNB,
25910 IX86_BUILTIN_PSIGNW,
25911 IX86_BUILTIN_PSIGND,
25912 IX86_BUILTIN_PALIGNR,
25913 IX86_BUILTIN_PABSB,
25914 IX86_BUILTIN_PABSW,
25915 IX86_BUILTIN_PABSD,
25917 IX86_BUILTIN_PHADDW128,
25918 IX86_BUILTIN_PHADDD128,
25919 IX86_BUILTIN_PHADDSW128,
25920 IX86_BUILTIN_PHSUBW128,
25921 IX86_BUILTIN_PHSUBD128,
25922 IX86_BUILTIN_PHSUBSW128,
25923 IX86_BUILTIN_PMADDUBSW128,
25924 IX86_BUILTIN_PMULHRSW128,
25925 IX86_BUILTIN_PSHUFB128,
25926 IX86_BUILTIN_PSIGNB128,
25927 IX86_BUILTIN_PSIGNW128,
25928 IX86_BUILTIN_PSIGND128,
25929 IX86_BUILTIN_PALIGNR128,
25930 IX86_BUILTIN_PABSB128,
25931 IX86_BUILTIN_PABSW128,
25932 IX86_BUILTIN_PABSD128,
25934 /* AMDFAM10 - SSE4A New Instructions. */
25935 IX86_BUILTIN_MOVNTSD,
25936 IX86_BUILTIN_MOVNTSS,
25937 IX86_BUILTIN_EXTRQI,
25938 IX86_BUILTIN_EXTRQ,
25939 IX86_BUILTIN_INSERTQI,
25940 IX86_BUILTIN_INSERTQ,
25942 /* SSE4.1. */
25943 IX86_BUILTIN_BLENDPD,
25944 IX86_BUILTIN_BLENDPS,
25945 IX86_BUILTIN_BLENDVPD,
25946 IX86_BUILTIN_BLENDVPS,
25947 IX86_BUILTIN_PBLENDVB128,
25948 IX86_BUILTIN_PBLENDW128,
25950 IX86_BUILTIN_DPPD,
25951 IX86_BUILTIN_DPPS,
25953 IX86_BUILTIN_INSERTPS128,
25955 IX86_BUILTIN_MOVNTDQA,
25956 IX86_BUILTIN_MPSADBW128,
25957 IX86_BUILTIN_PACKUSDW128,
25958 IX86_BUILTIN_PCMPEQQ,
25959 IX86_BUILTIN_PHMINPOSUW128,
25961 IX86_BUILTIN_PMAXSB128,
25962 IX86_BUILTIN_PMAXSD128,
25963 IX86_BUILTIN_PMAXUD128,
25964 IX86_BUILTIN_PMAXUW128,
25966 IX86_BUILTIN_PMINSB128,
25967 IX86_BUILTIN_PMINSD128,
25968 IX86_BUILTIN_PMINUD128,
25969 IX86_BUILTIN_PMINUW128,
25971 IX86_BUILTIN_PMOVSXBW128,
25972 IX86_BUILTIN_PMOVSXBD128,
25973 IX86_BUILTIN_PMOVSXBQ128,
25974 IX86_BUILTIN_PMOVSXWD128,
25975 IX86_BUILTIN_PMOVSXWQ128,
25976 IX86_BUILTIN_PMOVSXDQ128,
25978 IX86_BUILTIN_PMOVZXBW128,
25979 IX86_BUILTIN_PMOVZXBD128,
25980 IX86_BUILTIN_PMOVZXBQ128,
25981 IX86_BUILTIN_PMOVZXWD128,
25982 IX86_BUILTIN_PMOVZXWQ128,
25983 IX86_BUILTIN_PMOVZXDQ128,
25985 IX86_BUILTIN_PMULDQ128,
25986 IX86_BUILTIN_PMULLD128,
25988 IX86_BUILTIN_ROUNDSD,
25989 IX86_BUILTIN_ROUNDSS,
25991 IX86_BUILTIN_ROUNDPD,
25992 IX86_BUILTIN_ROUNDPS,
25994 IX86_BUILTIN_FLOORPD,
25995 IX86_BUILTIN_CEILPD,
25996 IX86_BUILTIN_TRUNCPD,
25997 IX86_BUILTIN_RINTPD,
25998 IX86_BUILTIN_ROUNDPD_AZ,
26000 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26001 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26002 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26004 IX86_BUILTIN_FLOORPS,
26005 IX86_BUILTIN_CEILPS,
26006 IX86_BUILTIN_TRUNCPS,
26007 IX86_BUILTIN_RINTPS,
26008 IX86_BUILTIN_ROUNDPS_AZ,
26010 IX86_BUILTIN_FLOORPS_SFIX,
26011 IX86_BUILTIN_CEILPS_SFIX,
26012 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26014 IX86_BUILTIN_PTESTZ,
26015 IX86_BUILTIN_PTESTC,
26016 IX86_BUILTIN_PTESTNZC,
26018 IX86_BUILTIN_VEC_INIT_V2SI,
26019 IX86_BUILTIN_VEC_INIT_V4HI,
26020 IX86_BUILTIN_VEC_INIT_V8QI,
26021 IX86_BUILTIN_VEC_EXT_V2DF,
26022 IX86_BUILTIN_VEC_EXT_V2DI,
26023 IX86_BUILTIN_VEC_EXT_V4SF,
26024 IX86_BUILTIN_VEC_EXT_V4SI,
26025 IX86_BUILTIN_VEC_EXT_V8HI,
26026 IX86_BUILTIN_VEC_EXT_V2SI,
26027 IX86_BUILTIN_VEC_EXT_V4HI,
26028 IX86_BUILTIN_VEC_EXT_V16QI,
26029 IX86_BUILTIN_VEC_SET_V2DI,
26030 IX86_BUILTIN_VEC_SET_V4SF,
26031 IX86_BUILTIN_VEC_SET_V4SI,
26032 IX86_BUILTIN_VEC_SET_V8HI,
26033 IX86_BUILTIN_VEC_SET_V4HI,
26034 IX86_BUILTIN_VEC_SET_V16QI,
26036 IX86_BUILTIN_VEC_PACK_SFIX,
26037 IX86_BUILTIN_VEC_PACK_SFIX256,
26039 /* SSE4.2. */
26040 IX86_BUILTIN_CRC32QI,
26041 IX86_BUILTIN_CRC32HI,
26042 IX86_BUILTIN_CRC32SI,
26043 IX86_BUILTIN_CRC32DI,
26045 IX86_BUILTIN_PCMPESTRI128,
26046 IX86_BUILTIN_PCMPESTRM128,
26047 IX86_BUILTIN_PCMPESTRA128,
26048 IX86_BUILTIN_PCMPESTRC128,
26049 IX86_BUILTIN_PCMPESTRO128,
26050 IX86_BUILTIN_PCMPESTRS128,
26051 IX86_BUILTIN_PCMPESTRZ128,
26052 IX86_BUILTIN_PCMPISTRI128,
26053 IX86_BUILTIN_PCMPISTRM128,
26054 IX86_BUILTIN_PCMPISTRA128,
26055 IX86_BUILTIN_PCMPISTRC128,
26056 IX86_BUILTIN_PCMPISTRO128,
26057 IX86_BUILTIN_PCMPISTRS128,
26058 IX86_BUILTIN_PCMPISTRZ128,
26060 IX86_BUILTIN_PCMPGTQ,
26062 /* AES instructions */
26063 IX86_BUILTIN_AESENC128,
26064 IX86_BUILTIN_AESENCLAST128,
26065 IX86_BUILTIN_AESDEC128,
26066 IX86_BUILTIN_AESDECLAST128,
26067 IX86_BUILTIN_AESIMC128,
26068 IX86_BUILTIN_AESKEYGENASSIST128,
26070 /* PCLMUL instruction */
26071 IX86_BUILTIN_PCLMULQDQ128,
26073 /* AVX */
26074 IX86_BUILTIN_ADDPD256,
26075 IX86_BUILTIN_ADDPS256,
26076 IX86_BUILTIN_ADDSUBPD256,
26077 IX86_BUILTIN_ADDSUBPS256,
26078 IX86_BUILTIN_ANDPD256,
26079 IX86_BUILTIN_ANDPS256,
26080 IX86_BUILTIN_ANDNPD256,
26081 IX86_BUILTIN_ANDNPS256,
26082 IX86_BUILTIN_BLENDPD256,
26083 IX86_BUILTIN_BLENDPS256,
26084 IX86_BUILTIN_BLENDVPD256,
26085 IX86_BUILTIN_BLENDVPS256,
26086 IX86_BUILTIN_DIVPD256,
26087 IX86_BUILTIN_DIVPS256,
26088 IX86_BUILTIN_DPPS256,
26089 IX86_BUILTIN_HADDPD256,
26090 IX86_BUILTIN_HADDPS256,
26091 IX86_BUILTIN_HSUBPD256,
26092 IX86_BUILTIN_HSUBPS256,
26093 IX86_BUILTIN_MAXPD256,
26094 IX86_BUILTIN_MAXPS256,
26095 IX86_BUILTIN_MINPD256,
26096 IX86_BUILTIN_MINPS256,
26097 IX86_BUILTIN_MULPD256,
26098 IX86_BUILTIN_MULPS256,
26099 IX86_BUILTIN_ORPD256,
26100 IX86_BUILTIN_ORPS256,
26101 IX86_BUILTIN_SHUFPD256,
26102 IX86_BUILTIN_SHUFPS256,
26103 IX86_BUILTIN_SUBPD256,
26104 IX86_BUILTIN_SUBPS256,
26105 IX86_BUILTIN_XORPD256,
26106 IX86_BUILTIN_XORPS256,
26107 IX86_BUILTIN_CMPSD,
26108 IX86_BUILTIN_CMPSS,
26109 IX86_BUILTIN_CMPPD,
26110 IX86_BUILTIN_CMPPS,
26111 IX86_BUILTIN_CMPPD256,
26112 IX86_BUILTIN_CMPPS256,
26113 IX86_BUILTIN_CVTDQ2PD256,
26114 IX86_BUILTIN_CVTDQ2PS256,
26115 IX86_BUILTIN_CVTPD2PS256,
26116 IX86_BUILTIN_CVTPS2DQ256,
26117 IX86_BUILTIN_CVTPS2PD256,
26118 IX86_BUILTIN_CVTTPD2DQ256,
26119 IX86_BUILTIN_CVTPD2DQ256,
26120 IX86_BUILTIN_CVTTPS2DQ256,
26121 IX86_BUILTIN_EXTRACTF128PD256,
26122 IX86_BUILTIN_EXTRACTF128PS256,
26123 IX86_BUILTIN_EXTRACTF128SI256,
26124 IX86_BUILTIN_VZEROALL,
26125 IX86_BUILTIN_VZEROUPPER,
26126 IX86_BUILTIN_VPERMILVARPD,
26127 IX86_BUILTIN_VPERMILVARPS,
26128 IX86_BUILTIN_VPERMILVARPD256,
26129 IX86_BUILTIN_VPERMILVARPS256,
26130 IX86_BUILTIN_VPERMILPD,
26131 IX86_BUILTIN_VPERMILPS,
26132 IX86_BUILTIN_VPERMILPD256,
26133 IX86_BUILTIN_VPERMILPS256,
26134 IX86_BUILTIN_VPERMIL2PD,
26135 IX86_BUILTIN_VPERMIL2PS,
26136 IX86_BUILTIN_VPERMIL2PD256,
26137 IX86_BUILTIN_VPERMIL2PS256,
26138 IX86_BUILTIN_VPERM2F128PD256,
26139 IX86_BUILTIN_VPERM2F128PS256,
26140 IX86_BUILTIN_VPERM2F128SI256,
26141 IX86_BUILTIN_VBROADCASTSS,
26142 IX86_BUILTIN_VBROADCASTSD256,
26143 IX86_BUILTIN_VBROADCASTSS256,
26144 IX86_BUILTIN_VBROADCASTPD256,
26145 IX86_BUILTIN_VBROADCASTPS256,
26146 IX86_BUILTIN_VINSERTF128PD256,
26147 IX86_BUILTIN_VINSERTF128PS256,
26148 IX86_BUILTIN_VINSERTF128SI256,
26149 IX86_BUILTIN_LOADUPD256,
26150 IX86_BUILTIN_LOADUPS256,
26151 IX86_BUILTIN_STOREUPD256,
26152 IX86_BUILTIN_STOREUPS256,
26153 IX86_BUILTIN_LDDQU256,
26154 IX86_BUILTIN_MOVNTDQ256,
26155 IX86_BUILTIN_MOVNTPD256,
26156 IX86_BUILTIN_MOVNTPS256,
26157 IX86_BUILTIN_LOADDQU256,
26158 IX86_BUILTIN_STOREDQU256,
26159 IX86_BUILTIN_MASKLOADPD,
26160 IX86_BUILTIN_MASKLOADPS,
26161 IX86_BUILTIN_MASKSTOREPD,
26162 IX86_BUILTIN_MASKSTOREPS,
26163 IX86_BUILTIN_MASKLOADPD256,
26164 IX86_BUILTIN_MASKLOADPS256,
26165 IX86_BUILTIN_MASKSTOREPD256,
26166 IX86_BUILTIN_MASKSTOREPS256,
26167 IX86_BUILTIN_MOVSHDUP256,
26168 IX86_BUILTIN_MOVSLDUP256,
26169 IX86_BUILTIN_MOVDDUP256,
26171 IX86_BUILTIN_SQRTPD256,
26172 IX86_BUILTIN_SQRTPS256,
26173 IX86_BUILTIN_SQRTPS_NR256,
26174 IX86_BUILTIN_RSQRTPS256,
26175 IX86_BUILTIN_RSQRTPS_NR256,
26177 IX86_BUILTIN_RCPPS256,
26179 IX86_BUILTIN_ROUNDPD256,
26180 IX86_BUILTIN_ROUNDPS256,
26182 IX86_BUILTIN_FLOORPD256,
26183 IX86_BUILTIN_CEILPD256,
26184 IX86_BUILTIN_TRUNCPD256,
26185 IX86_BUILTIN_RINTPD256,
26186 IX86_BUILTIN_ROUNDPD_AZ256,
26188 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26189 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26190 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26192 IX86_BUILTIN_FLOORPS256,
26193 IX86_BUILTIN_CEILPS256,
26194 IX86_BUILTIN_TRUNCPS256,
26195 IX86_BUILTIN_RINTPS256,
26196 IX86_BUILTIN_ROUNDPS_AZ256,
26198 IX86_BUILTIN_FLOORPS_SFIX256,
26199 IX86_BUILTIN_CEILPS_SFIX256,
26200 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26202 IX86_BUILTIN_UNPCKHPD256,
26203 IX86_BUILTIN_UNPCKLPD256,
26204 IX86_BUILTIN_UNPCKHPS256,
26205 IX86_BUILTIN_UNPCKLPS256,
26207 IX86_BUILTIN_SI256_SI,
26208 IX86_BUILTIN_PS256_PS,
26209 IX86_BUILTIN_PD256_PD,
26210 IX86_BUILTIN_SI_SI256,
26211 IX86_BUILTIN_PS_PS256,
26212 IX86_BUILTIN_PD_PD256,
26214 IX86_BUILTIN_VTESTZPD,
26215 IX86_BUILTIN_VTESTCPD,
26216 IX86_BUILTIN_VTESTNZCPD,
26217 IX86_BUILTIN_VTESTZPS,
26218 IX86_BUILTIN_VTESTCPS,
26219 IX86_BUILTIN_VTESTNZCPS,
26220 IX86_BUILTIN_VTESTZPD256,
26221 IX86_BUILTIN_VTESTCPD256,
26222 IX86_BUILTIN_VTESTNZCPD256,
26223 IX86_BUILTIN_VTESTZPS256,
26224 IX86_BUILTIN_VTESTCPS256,
26225 IX86_BUILTIN_VTESTNZCPS256,
26226 IX86_BUILTIN_PTESTZ256,
26227 IX86_BUILTIN_PTESTC256,
26228 IX86_BUILTIN_PTESTNZC256,
26230 IX86_BUILTIN_MOVMSKPD256,
26231 IX86_BUILTIN_MOVMSKPS256,
26233 /* AVX2 */
26234 IX86_BUILTIN_MPSADBW256,
26235 IX86_BUILTIN_PABSB256,
26236 IX86_BUILTIN_PABSW256,
26237 IX86_BUILTIN_PABSD256,
26238 IX86_BUILTIN_PACKSSDW256,
26239 IX86_BUILTIN_PACKSSWB256,
26240 IX86_BUILTIN_PACKUSDW256,
26241 IX86_BUILTIN_PACKUSWB256,
26242 IX86_BUILTIN_PADDB256,
26243 IX86_BUILTIN_PADDW256,
26244 IX86_BUILTIN_PADDD256,
26245 IX86_BUILTIN_PADDQ256,
26246 IX86_BUILTIN_PADDSB256,
26247 IX86_BUILTIN_PADDSW256,
26248 IX86_BUILTIN_PADDUSB256,
26249 IX86_BUILTIN_PADDUSW256,
26250 IX86_BUILTIN_PALIGNR256,
26251 IX86_BUILTIN_AND256I,
26252 IX86_BUILTIN_ANDNOT256I,
26253 IX86_BUILTIN_PAVGB256,
26254 IX86_BUILTIN_PAVGW256,
26255 IX86_BUILTIN_PBLENDVB256,
26256 IX86_BUILTIN_PBLENDVW256,
26257 IX86_BUILTIN_PCMPEQB256,
26258 IX86_BUILTIN_PCMPEQW256,
26259 IX86_BUILTIN_PCMPEQD256,
26260 IX86_BUILTIN_PCMPEQQ256,
26261 IX86_BUILTIN_PCMPGTB256,
26262 IX86_BUILTIN_PCMPGTW256,
26263 IX86_BUILTIN_PCMPGTD256,
26264 IX86_BUILTIN_PCMPGTQ256,
26265 IX86_BUILTIN_PHADDW256,
26266 IX86_BUILTIN_PHADDD256,
26267 IX86_BUILTIN_PHADDSW256,
26268 IX86_BUILTIN_PHSUBW256,
26269 IX86_BUILTIN_PHSUBD256,
26270 IX86_BUILTIN_PHSUBSW256,
26271 IX86_BUILTIN_PMADDUBSW256,
26272 IX86_BUILTIN_PMADDWD256,
26273 IX86_BUILTIN_PMAXSB256,
26274 IX86_BUILTIN_PMAXSW256,
26275 IX86_BUILTIN_PMAXSD256,
26276 IX86_BUILTIN_PMAXUB256,
26277 IX86_BUILTIN_PMAXUW256,
26278 IX86_BUILTIN_PMAXUD256,
26279 IX86_BUILTIN_PMINSB256,
26280 IX86_BUILTIN_PMINSW256,
26281 IX86_BUILTIN_PMINSD256,
26282 IX86_BUILTIN_PMINUB256,
26283 IX86_BUILTIN_PMINUW256,
26284 IX86_BUILTIN_PMINUD256,
26285 IX86_BUILTIN_PMOVMSKB256,
26286 IX86_BUILTIN_PMOVSXBW256,
26287 IX86_BUILTIN_PMOVSXBD256,
26288 IX86_BUILTIN_PMOVSXBQ256,
26289 IX86_BUILTIN_PMOVSXWD256,
26290 IX86_BUILTIN_PMOVSXWQ256,
26291 IX86_BUILTIN_PMOVSXDQ256,
26292 IX86_BUILTIN_PMOVZXBW256,
26293 IX86_BUILTIN_PMOVZXBD256,
26294 IX86_BUILTIN_PMOVZXBQ256,
26295 IX86_BUILTIN_PMOVZXWD256,
26296 IX86_BUILTIN_PMOVZXWQ256,
26297 IX86_BUILTIN_PMOVZXDQ256,
26298 IX86_BUILTIN_PMULDQ256,
26299 IX86_BUILTIN_PMULHRSW256,
26300 IX86_BUILTIN_PMULHUW256,
26301 IX86_BUILTIN_PMULHW256,
26302 IX86_BUILTIN_PMULLW256,
26303 IX86_BUILTIN_PMULLD256,
26304 IX86_BUILTIN_PMULUDQ256,
26305 IX86_BUILTIN_POR256,
26306 IX86_BUILTIN_PSADBW256,
26307 IX86_BUILTIN_PSHUFB256,
26308 IX86_BUILTIN_PSHUFD256,
26309 IX86_BUILTIN_PSHUFHW256,
26310 IX86_BUILTIN_PSHUFLW256,
26311 IX86_BUILTIN_PSIGNB256,
26312 IX86_BUILTIN_PSIGNW256,
26313 IX86_BUILTIN_PSIGND256,
26314 IX86_BUILTIN_PSLLDQI256,
26315 IX86_BUILTIN_PSLLWI256,
26316 IX86_BUILTIN_PSLLW256,
26317 IX86_BUILTIN_PSLLDI256,
26318 IX86_BUILTIN_PSLLD256,
26319 IX86_BUILTIN_PSLLQI256,
26320 IX86_BUILTIN_PSLLQ256,
26321 IX86_BUILTIN_PSRAWI256,
26322 IX86_BUILTIN_PSRAW256,
26323 IX86_BUILTIN_PSRADI256,
26324 IX86_BUILTIN_PSRAD256,
26325 IX86_BUILTIN_PSRLDQI256,
26326 IX86_BUILTIN_PSRLWI256,
26327 IX86_BUILTIN_PSRLW256,
26328 IX86_BUILTIN_PSRLDI256,
26329 IX86_BUILTIN_PSRLD256,
26330 IX86_BUILTIN_PSRLQI256,
26331 IX86_BUILTIN_PSRLQ256,
26332 IX86_BUILTIN_PSUBB256,
26333 IX86_BUILTIN_PSUBW256,
26334 IX86_BUILTIN_PSUBD256,
26335 IX86_BUILTIN_PSUBQ256,
26336 IX86_BUILTIN_PSUBSB256,
26337 IX86_BUILTIN_PSUBSW256,
26338 IX86_BUILTIN_PSUBUSB256,
26339 IX86_BUILTIN_PSUBUSW256,
26340 IX86_BUILTIN_PUNPCKHBW256,
26341 IX86_BUILTIN_PUNPCKHWD256,
26342 IX86_BUILTIN_PUNPCKHDQ256,
26343 IX86_BUILTIN_PUNPCKHQDQ256,
26344 IX86_BUILTIN_PUNPCKLBW256,
26345 IX86_BUILTIN_PUNPCKLWD256,
26346 IX86_BUILTIN_PUNPCKLDQ256,
26347 IX86_BUILTIN_PUNPCKLQDQ256,
26348 IX86_BUILTIN_PXOR256,
26349 IX86_BUILTIN_MOVNTDQA256,
26350 IX86_BUILTIN_VBROADCASTSS_PS,
26351 IX86_BUILTIN_VBROADCASTSS_PS256,
26352 IX86_BUILTIN_VBROADCASTSD_PD256,
26353 IX86_BUILTIN_VBROADCASTSI256,
26354 IX86_BUILTIN_PBLENDD256,
26355 IX86_BUILTIN_PBLENDD128,
26356 IX86_BUILTIN_PBROADCASTB256,
26357 IX86_BUILTIN_PBROADCASTW256,
26358 IX86_BUILTIN_PBROADCASTD256,
26359 IX86_BUILTIN_PBROADCASTQ256,
26360 IX86_BUILTIN_PBROADCASTB128,
26361 IX86_BUILTIN_PBROADCASTW128,
26362 IX86_BUILTIN_PBROADCASTD128,
26363 IX86_BUILTIN_PBROADCASTQ128,
26364 IX86_BUILTIN_VPERMVARSI256,
26365 IX86_BUILTIN_VPERMDF256,
26366 IX86_BUILTIN_VPERMVARSF256,
26367 IX86_BUILTIN_VPERMDI256,
26368 IX86_BUILTIN_VPERMTI256,
26369 IX86_BUILTIN_VEXTRACT128I256,
26370 IX86_BUILTIN_VINSERT128I256,
26371 IX86_BUILTIN_MASKLOADD,
26372 IX86_BUILTIN_MASKLOADQ,
26373 IX86_BUILTIN_MASKLOADD256,
26374 IX86_BUILTIN_MASKLOADQ256,
26375 IX86_BUILTIN_MASKSTORED,
26376 IX86_BUILTIN_MASKSTOREQ,
26377 IX86_BUILTIN_MASKSTORED256,
26378 IX86_BUILTIN_MASKSTOREQ256,
26379 IX86_BUILTIN_PSLLVV4DI,
26380 IX86_BUILTIN_PSLLVV2DI,
26381 IX86_BUILTIN_PSLLVV8SI,
26382 IX86_BUILTIN_PSLLVV4SI,
26383 IX86_BUILTIN_PSRAVV8SI,
26384 IX86_BUILTIN_PSRAVV4SI,
26385 IX86_BUILTIN_PSRLVV4DI,
26386 IX86_BUILTIN_PSRLVV2DI,
26387 IX86_BUILTIN_PSRLVV8SI,
26388 IX86_BUILTIN_PSRLVV4SI,
26390 IX86_BUILTIN_GATHERSIV2DF,
26391 IX86_BUILTIN_GATHERSIV4DF,
26392 IX86_BUILTIN_GATHERDIV2DF,
26393 IX86_BUILTIN_GATHERDIV4DF,
26394 IX86_BUILTIN_GATHERSIV4SF,
26395 IX86_BUILTIN_GATHERSIV8SF,
26396 IX86_BUILTIN_GATHERDIV4SF,
26397 IX86_BUILTIN_GATHERDIV8SF,
26398 IX86_BUILTIN_GATHERSIV2DI,
26399 IX86_BUILTIN_GATHERSIV4DI,
26400 IX86_BUILTIN_GATHERDIV2DI,
26401 IX86_BUILTIN_GATHERDIV4DI,
26402 IX86_BUILTIN_GATHERSIV4SI,
26403 IX86_BUILTIN_GATHERSIV8SI,
26404 IX86_BUILTIN_GATHERDIV4SI,
26405 IX86_BUILTIN_GATHERDIV8SI,
26407 /* Alternate 4-element gather for the vectorizer where
26408 all operands are 32 bytes wide. */
26409 IX86_BUILTIN_GATHERALTSIV4DF,
26410 IX86_BUILTIN_GATHERALTDIV8SF,
26411 IX86_BUILTIN_GATHERALTSIV4DI,
26412 IX86_BUILTIN_GATHERALTDIV8SI,
26414 /* TFmode support builtins. */
26415 IX86_BUILTIN_INFQ,
26416 IX86_BUILTIN_HUGE_VALQ,
26417 IX86_BUILTIN_FABSQ,
26418 IX86_BUILTIN_COPYSIGNQ,
26420 /* Vectorizer support builtins. */
26421 IX86_BUILTIN_CPYSGNPS,
26422 IX86_BUILTIN_CPYSGNPD,
26423 IX86_BUILTIN_CPYSGNPS256,
26424 IX86_BUILTIN_CPYSGNPD256,
26426 /* FMA4 instructions. */
26427 IX86_BUILTIN_VFMADDSS,
26428 IX86_BUILTIN_VFMADDSD,
26429 IX86_BUILTIN_VFMADDPS,
26430 IX86_BUILTIN_VFMADDPD,
26431 IX86_BUILTIN_VFMADDPS256,
26432 IX86_BUILTIN_VFMADDPD256,
26433 IX86_BUILTIN_VFMADDSUBPS,
26434 IX86_BUILTIN_VFMADDSUBPD,
26435 IX86_BUILTIN_VFMADDSUBPS256,
26436 IX86_BUILTIN_VFMADDSUBPD256,
26438 /* FMA3 instructions. */
26439 IX86_BUILTIN_VFMADDSS3,
26440 IX86_BUILTIN_VFMADDSD3,
26442 /* XOP instructions. */
26443 IX86_BUILTIN_VPCMOV,
26444 IX86_BUILTIN_VPCMOV_V2DI,
26445 IX86_BUILTIN_VPCMOV_V4SI,
26446 IX86_BUILTIN_VPCMOV_V8HI,
26447 IX86_BUILTIN_VPCMOV_V16QI,
26448 IX86_BUILTIN_VPCMOV_V4SF,
26449 IX86_BUILTIN_VPCMOV_V2DF,
26450 IX86_BUILTIN_VPCMOV256,
26451 IX86_BUILTIN_VPCMOV_V4DI256,
26452 IX86_BUILTIN_VPCMOV_V8SI256,
26453 IX86_BUILTIN_VPCMOV_V16HI256,
26454 IX86_BUILTIN_VPCMOV_V32QI256,
26455 IX86_BUILTIN_VPCMOV_V8SF256,
26456 IX86_BUILTIN_VPCMOV_V4DF256,
26458 IX86_BUILTIN_VPPERM,
26460 IX86_BUILTIN_VPMACSSWW,
26461 IX86_BUILTIN_VPMACSWW,
26462 IX86_BUILTIN_VPMACSSWD,
26463 IX86_BUILTIN_VPMACSWD,
26464 IX86_BUILTIN_VPMACSSDD,
26465 IX86_BUILTIN_VPMACSDD,
26466 IX86_BUILTIN_VPMACSSDQL,
26467 IX86_BUILTIN_VPMACSSDQH,
26468 IX86_BUILTIN_VPMACSDQL,
26469 IX86_BUILTIN_VPMACSDQH,
26470 IX86_BUILTIN_VPMADCSSWD,
26471 IX86_BUILTIN_VPMADCSWD,
26473 IX86_BUILTIN_VPHADDBW,
26474 IX86_BUILTIN_VPHADDBD,
26475 IX86_BUILTIN_VPHADDBQ,
26476 IX86_BUILTIN_VPHADDWD,
26477 IX86_BUILTIN_VPHADDWQ,
26478 IX86_BUILTIN_VPHADDDQ,
26479 IX86_BUILTIN_VPHADDUBW,
26480 IX86_BUILTIN_VPHADDUBD,
26481 IX86_BUILTIN_VPHADDUBQ,
26482 IX86_BUILTIN_VPHADDUWD,
26483 IX86_BUILTIN_VPHADDUWQ,
26484 IX86_BUILTIN_VPHADDUDQ,
26485 IX86_BUILTIN_VPHSUBBW,
26486 IX86_BUILTIN_VPHSUBWD,
26487 IX86_BUILTIN_VPHSUBDQ,
26489 IX86_BUILTIN_VPROTB,
26490 IX86_BUILTIN_VPROTW,
26491 IX86_BUILTIN_VPROTD,
26492 IX86_BUILTIN_VPROTQ,
26493 IX86_BUILTIN_VPROTB_IMM,
26494 IX86_BUILTIN_VPROTW_IMM,
26495 IX86_BUILTIN_VPROTD_IMM,
26496 IX86_BUILTIN_VPROTQ_IMM,
26498 IX86_BUILTIN_VPSHLB,
26499 IX86_BUILTIN_VPSHLW,
26500 IX86_BUILTIN_VPSHLD,
26501 IX86_BUILTIN_VPSHLQ,
26502 IX86_BUILTIN_VPSHAB,
26503 IX86_BUILTIN_VPSHAW,
26504 IX86_BUILTIN_VPSHAD,
26505 IX86_BUILTIN_VPSHAQ,
26507 IX86_BUILTIN_VFRCZSS,
26508 IX86_BUILTIN_VFRCZSD,
26509 IX86_BUILTIN_VFRCZPS,
26510 IX86_BUILTIN_VFRCZPD,
26511 IX86_BUILTIN_VFRCZPS256,
26512 IX86_BUILTIN_VFRCZPD256,
26514 IX86_BUILTIN_VPCOMEQUB,
26515 IX86_BUILTIN_VPCOMNEUB,
26516 IX86_BUILTIN_VPCOMLTUB,
26517 IX86_BUILTIN_VPCOMLEUB,
26518 IX86_BUILTIN_VPCOMGTUB,
26519 IX86_BUILTIN_VPCOMGEUB,
26520 IX86_BUILTIN_VPCOMFALSEUB,
26521 IX86_BUILTIN_VPCOMTRUEUB,
26523 IX86_BUILTIN_VPCOMEQUW,
26524 IX86_BUILTIN_VPCOMNEUW,
26525 IX86_BUILTIN_VPCOMLTUW,
26526 IX86_BUILTIN_VPCOMLEUW,
26527 IX86_BUILTIN_VPCOMGTUW,
26528 IX86_BUILTIN_VPCOMGEUW,
26529 IX86_BUILTIN_VPCOMFALSEUW,
26530 IX86_BUILTIN_VPCOMTRUEUW,
26532 IX86_BUILTIN_VPCOMEQUD,
26533 IX86_BUILTIN_VPCOMNEUD,
26534 IX86_BUILTIN_VPCOMLTUD,
26535 IX86_BUILTIN_VPCOMLEUD,
26536 IX86_BUILTIN_VPCOMGTUD,
26537 IX86_BUILTIN_VPCOMGEUD,
26538 IX86_BUILTIN_VPCOMFALSEUD,
26539 IX86_BUILTIN_VPCOMTRUEUD,
26541 IX86_BUILTIN_VPCOMEQUQ,
26542 IX86_BUILTIN_VPCOMNEUQ,
26543 IX86_BUILTIN_VPCOMLTUQ,
26544 IX86_BUILTIN_VPCOMLEUQ,
26545 IX86_BUILTIN_VPCOMGTUQ,
26546 IX86_BUILTIN_VPCOMGEUQ,
26547 IX86_BUILTIN_VPCOMFALSEUQ,
26548 IX86_BUILTIN_VPCOMTRUEUQ,
26550 IX86_BUILTIN_VPCOMEQB,
26551 IX86_BUILTIN_VPCOMNEB,
26552 IX86_BUILTIN_VPCOMLTB,
26553 IX86_BUILTIN_VPCOMLEB,
26554 IX86_BUILTIN_VPCOMGTB,
26555 IX86_BUILTIN_VPCOMGEB,
26556 IX86_BUILTIN_VPCOMFALSEB,
26557 IX86_BUILTIN_VPCOMTRUEB,
26559 IX86_BUILTIN_VPCOMEQW,
26560 IX86_BUILTIN_VPCOMNEW,
26561 IX86_BUILTIN_VPCOMLTW,
26562 IX86_BUILTIN_VPCOMLEW,
26563 IX86_BUILTIN_VPCOMGTW,
26564 IX86_BUILTIN_VPCOMGEW,
26565 IX86_BUILTIN_VPCOMFALSEW,
26566 IX86_BUILTIN_VPCOMTRUEW,
26568 IX86_BUILTIN_VPCOMEQD,
26569 IX86_BUILTIN_VPCOMNED,
26570 IX86_BUILTIN_VPCOMLTD,
26571 IX86_BUILTIN_VPCOMLED,
26572 IX86_BUILTIN_VPCOMGTD,
26573 IX86_BUILTIN_VPCOMGED,
26574 IX86_BUILTIN_VPCOMFALSED,
26575 IX86_BUILTIN_VPCOMTRUED,
26577 IX86_BUILTIN_VPCOMEQQ,
26578 IX86_BUILTIN_VPCOMNEQ,
26579 IX86_BUILTIN_VPCOMLTQ,
26580 IX86_BUILTIN_VPCOMLEQ,
26581 IX86_BUILTIN_VPCOMGTQ,
26582 IX86_BUILTIN_VPCOMGEQ,
26583 IX86_BUILTIN_VPCOMFALSEQ,
26584 IX86_BUILTIN_VPCOMTRUEQ,
26586 /* LWP instructions. */
26587 IX86_BUILTIN_LLWPCB,
26588 IX86_BUILTIN_SLWPCB,
26589 IX86_BUILTIN_LWPVAL32,
26590 IX86_BUILTIN_LWPVAL64,
26591 IX86_BUILTIN_LWPINS32,
26592 IX86_BUILTIN_LWPINS64,
26594 IX86_BUILTIN_CLZS,
26596 /* RTM */
26597 IX86_BUILTIN_XBEGIN,
26598 IX86_BUILTIN_XEND,
26599 IX86_BUILTIN_XABORT,
26600 IX86_BUILTIN_XTEST,
26602 /* BMI instructions. */
26603 IX86_BUILTIN_BEXTR32,
26604 IX86_BUILTIN_BEXTR64,
26605 IX86_BUILTIN_CTZS,
26607 /* TBM instructions. */
26608 IX86_BUILTIN_BEXTRI32,
26609 IX86_BUILTIN_BEXTRI64,
26611 /* BMI2 instructions. */
26612 IX86_BUILTIN_BZHI32,
26613 IX86_BUILTIN_BZHI64,
26614 IX86_BUILTIN_PDEP32,
26615 IX86_BUILTIN_PDEP64,
26616 IX86_BUILTIN_PEXT32,
26617 IX86_BUILTIN_PEXT64,
26619 /* ADX instructions. */
26620 IX86_BUILTIN_ADDCARRYX32,
26621 IX86_BUILTIN_ADDCARRYX64,
26623 /* FSGSBASE instructions. */
26624 IX86_BUILTIN_RDFSBASE32,
26625 IX86_BUILTIN_RDFSBASE64,
26626 IX86_BUILTIN_RDGSBASE32,
26627 IX86_BUILTIN_RDGSBASE64,
26628 IX86_BUILTIN_WRFSBASE32,
26629 IX86_BUILTIN_WRFSBASE64,
26630 IX86_BUILTIN_WRGSBASE32,
26631 IX86_BUILTIN_WRGSBASE64,
26633 /* RDRND instructions. */
26634 IX86_BUILTIN_RDRAND16_STEP,
26635 IX86_BUILTIN_RDRAND32_STEP,
26636 IX86_BUILTIN_RDRAND64_STEP,
26638 /* RDSEED instructions. */
26639 IX86_BUILTIN_RDSEED16_STEP,
26640 IX86_BUILTIN_RDSEED32_STEP,
26641 IX86_BUILTIN_RDSEED64_STEP,
26643 /* F16C instructions. */
26644 IX86_BUILTIN_CVTPH2PS,
26645 IX86_BUILTIN_CVTPH2PS256,
26646 IX86_BUILTIN_CVTPS2PH,
26647 IX86_BUILTIN_CVTPS2PH256,
26649 /* CFString built-in for darwin */
26650 IX86_BUILTIN_CFSTRING,
26652 /* Builtins to get CPU type and supported features. */
26653 IX86_BUILTIN_CPU_INIT,
26654 IX86_BUILTIN_CPU_IS,
26655 IX86_BUILTIN_CPU_SUPPORTS,
26657 IX86_BUILTIN_MAX
26660 /* Table for the ix86 builtin decls. */
26661 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26663 /* Table of all of the builtin functions that are possible with different ISA's
26664 but are waiting to be built until a function is declared to use that
26665 ISA. */
26666 struct builtin_isa {
26667 const char *name; /* function name */
26668 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26669 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26670 bool const_p; /* true if the declaration is constant */
26671 bool set_and_not_built_p;
26674 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26677 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26678 of which isa_flags to use in the ix86_builtins_isa array. Stores the
26679 function decl in the ix86_builtins array. Returns the function decl or
26680 NULL_TREE, if the builtin was not added.
26682 If the front end has a special hook for builtin functions, delay adding
26683 builtin functions that aren't in the current ISA until the ISA is changed
26684 with function specific optimization. Doing so, can save about 300K for the
26685 default compiler. When the builtin is expanded, check at that time whether
26686 it is valid.
26688 If the front end doesn't have a special hook, record all builtins, even if
26689 it isn't an instruction set in the current ISA in case the user uses
26690 function specific options for a different ISA, so that we don't get scope
26691 errors if a builtin is added in the middle of a function scope. */
26693 static inline tree
26694 def_builtin (HOST_WIDE_INT mask, const char *name,
26695 enum ix86_builtin_func_type tcode,
26696 enum ix86_builtins code)
26698 tree decl = NULL_TREE;
26700 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26702 ix86_builtins_isa[(int) code].isa = mask;
26704 mask &= ~OPTION_MASK_ISA_64BIT;
26705 if (mask == 0
26706 || (mask & ix86_isa_flags) != 0
26707 || (lang_hooks.builtin_function
26708 == lang_hooks.builtin_function_ext_scope))
26711 tree type = ix86_get_builtin_func_type (tcode);
26712 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26713 NULL, NULL_TREE);
26714 ix86_builtins[(int) code] = decl;
26715 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26717 else
26719 ix86_builtins[(int) code] = NULL_TREE;
26720 ix86_builtins_isa[(int) code].tcode = tcode;
26721 ix86_builtins_isa[(int) code].name = name;
26722 ix86_builtins_isa[(int) code].const_p = false;
26723 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26727 return decl;
26730 /* Like def_builtin, but also marks the function decl "const". */
26732 static inline tree
26733 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26734 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26736 tree decl = def_builtin (mask, name, tcode, code);
26737 if (decl)
26738 TREE_READONLY (decl) = 1;
26739 else
26740 ix86_builtins_isa[(int) code].const_p = true;
26742 return decl;
26745 /* Add any new builtin functions for a given ISA that may not have been
26746 declared. This saves a bit of space compared to adding all of the
26747 declarations to the tree, even if we didn't use them. */
26749 static void
26750 ix86_add_new_builtins (HOST_WIDE_INT isa)
26752 int i;
26754 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26756 if ((ix86_builtins_isa[i].isa & isa) != 0
26757 && ix86_builtins_isa[i].set_and_not_built_p)
26759 tree decl, type;
26761 /* Don't define the builtin again. */
26762 ix86_builtins_isa[i].set_and_not_built_p = false;
26764 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26765 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26766 type, i, BUILT_IN_MD, NULL,
26767 NULL_TREE);
26769 ix86_builtins[i] = decl;
26770 if (ix86_builtins_isa[i].const_p)
26771 TREE_READONLY (decl) = 1;
26776 /* Bits for builtin_description.flag. */
26778 /* Set when we don't support the comparison natively, and should
26779 swap_comparison in order to support it. */
26780 #define BUILTIN_DESC_SWAP_OPERANDS 1
26782 struct builtin_description
26784 const HOST_WIDE_INT mask;
26785 const enum insn_code icode;
26786 const char *const name;
26787 const enum ix86_builtins code;
26788 const enum rtx_code comparison;
26789 const int flag;
26792 static const struct builtin_description bdesc_comi[] =
26794 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26795 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26796 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26797 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26798 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26799 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26800 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26801 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26802 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26803 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26804 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26805 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26806 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26807 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26808 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26810 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26811 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26812 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26813 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26815 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26816 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26817 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
};
26820 static const struct builtin_description bdesc_pcmpestr[] =
{
26822 /* SSE4.2 */
26823 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26824 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26825 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26826 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26827 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26828 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26829 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
};
26832 static const struct builtin_description bdesc_pcmpistr[] =
{
26834 /* SSE4.2 */
26835 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26836 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26837 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26838 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26839 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26840 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26841 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
};
26844 /* Special builtins with variable number of arguments. */
26845 static const struct builtin_description bdesc_special_args[] =
{
26847 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26848 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26849 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26851 /* MMX */
26852 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26854 /* 3DNow! */
26855 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26857 /* FXSR, XSAVE and XSAVEOPT */
26858 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26859 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26860 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26861 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26862 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26864 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26865 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26866 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26867 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26868 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26870 /* SSE */
26871 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26872 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26873 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26875 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26876 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26877 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26878 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26880 /* SSE or 3DNow!A */
26881 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26882 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26884 /* SSE2 */
26885 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26886 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26887 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26888 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26889 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26890 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26891 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26892 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26893 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26894 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26896 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26897 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26899 /* SSE3 */
26900 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26902 /* SSE4.1 */
26903 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26905 /* SSE4A */
26906 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26907 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26909 /* AVX */
26910 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26911 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26913 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26914 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26915 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26916 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26917 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26919 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26920 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26921 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26922 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26923 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26924 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26925 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26927 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26928 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26929 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26931 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26932 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26933 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26934 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26935 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26936 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26937 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26938 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26940 /* AVX2 */
26941 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26943 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26944 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26945 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26946 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26951 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26952 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26953 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26954 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26955 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26956 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26958 /* FSGSBASE */
26959 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26960 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26961 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26962 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26963 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26964 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26965 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26966 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26968 /* RTM */
26969 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26970 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26971 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
};
26974 /* Builtins with variable number of arguments. */
26975 static const struct builtin_description bdesc_args[] =
{
26977 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26978 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26979 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26980 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26981 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26982 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26983 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26985 /* MMX */
26986 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26997 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27018 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27020 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27024 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27026 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27028 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27030 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27031 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27032 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27033 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27034 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27035 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27037 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27038 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27039 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27040 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27041 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27042 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27044 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27045 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27046 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27047 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27049 /* 3DNow! */
27050 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27051 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27052 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27053 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27055 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27056 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27057 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27058 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27059 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27060 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27061 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27062 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27063 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27064 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27065 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27066 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27067 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27068 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27069 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27071 /* 3DNow!A */
27072 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27073 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27074 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27075 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27076 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27077 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27079 /* SSE */
27080 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27082 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27088 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27091 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27127 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27128 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27130 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27132 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27133 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27134 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27135 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27137 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27139 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27140 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27141 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27142 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27143 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27145 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27146 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27147 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
27149 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27151 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27152 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27153 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27155 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27156 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27158 /* SSE MMX or 3DNow!A */
27159 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27160 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27161 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27163 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27164 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27165 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27166 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27168 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27169 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27171 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27173 /* SSE2 */
27174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27176 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27192 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27193 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27206 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27208 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27212 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27214 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27247 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27250 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI },
27268 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27278 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27283 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27286 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27289 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27291 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27310 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
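/* In the shift rows that follow, the ftype suffix hints how the count
   operand is expanded (see ix86_expand_args_builtin below): the
   ..._INT_CONVERT forms are the whole-register byte shifts whose immediate
   is converted to the insn's V1TImode, the ..._SI_COUNT forms take a scalar
   shift count (immediate or register), and the ..._V*_COUNT forms take the
   count from a vector register.  */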
27314 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27317 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27319 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27322 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27323 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27325 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27326 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27327 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27330 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27331 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27332 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27333 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27335 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27343 /* SSE2 MMX */
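/* The V1DImode paddq/psubq rows below are gated on SSE2 because the 64-bit
   MMX-register forms of PADDQ and PSUBQ were introduced together with
   SSE2.  */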
27344 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27345 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27347 /* SSE3 */
27348 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27349 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27351 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27352 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27353 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27354 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27355 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27356 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27358 /* SSSE3 */
27359 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27360 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27361 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27362 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27363 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27364 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27366 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27367 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27368 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27369 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27370 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27371 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27372 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27373 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27374 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27375 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27376 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27377 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27378 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27379 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27380 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27381 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27382 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27383 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27384 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27385 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27386 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27387 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27388 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27389 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27391 /* SSSE3. */
27392 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27393 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27395 /* SSE4.1 */
27396 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27398 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27399 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27401 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27402 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27403 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27404 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27405 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27407 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27408 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27409 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27410 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27411 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27412 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27413 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27414 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27415 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27416 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27417 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27418 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27419 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27421 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27422 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27423 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27424 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27425 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27426 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27427 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27428 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27429 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27430 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27432 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27434 /* SSE4.1 */
27435 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27436 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27437 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27438 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
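/* In the floor/ceil/trunc/rint rows below, the comparison slot of the
   descriptor is reused: instead of an rtx comparison code it carries the
   ROUND_* constant that becomes the rounding-mode immediate when the
   builtin is expanded.  */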
27440 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27441 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27442 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27443 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27445 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27446 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27448 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27449 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27451 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27452 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27453 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27454 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27456 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27457 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27459 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27460 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
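/* The ptestz/ptestc/ptestnzc rows use the comparison field to select which
   flag is tested after PTEST: EQ tests ZF (ptestz), LTU tests CF (ptestc),
   and GTU tests the CF=0 && ZF=0 case (ptestnzc).  */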
27462 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27463 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27464 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27466 /* SSE4.2 */
27467 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27468 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27469 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27470 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27471 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27473 /* SSE4A */
27474 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27475 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27476 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27477 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
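/* The AES and PCLMUL rows that follow have a null name: the user-visible
   builtin names are registered separately in this file under the AES and
   PCLMUL ISA flags, so these entries carry only the expansion
   information.  */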
27479 /* AES */
27480 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27481 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27483 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27484 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27485 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27486 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27488 /* PCLMUL */
27489 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27491 /* AVX */
27492 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27493 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27497 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27541 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27591 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27593 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27596 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27597 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27601 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27604 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27605 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27606 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
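/* As with the ptest rows above, the vtestz/vtestc/vtestnzc and ptest*256
   rows encode the tested flag in the comparison field (EQ, LTU, GTU).  */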
27608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27624 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27627 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27628 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27630 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27632 /* AVX2 */
27633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27635 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27636 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
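/* Illustrative note: the paddq256 row above wires the builtin onto the
   generic addv4di3 pattern, so a call such as
       __v4di sum = __builtin_ia32_paddq256 (a, b);
   (which is what <immintrin.h> emits for _mm256_add_epi64) expands to a
   single 256-bit vector add when AVX2 is enabled.  */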
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27756 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27757 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27766 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27780 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27782 /* BMI */
27783 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27784 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27785 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27787 /* TBM */
27788 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27789 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27791 /* F16C */
27792 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27793 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27794 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27795 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27797 /* BMI2 */
27798 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27799 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27800 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27801 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27802 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27803 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27806 /* FMA4 and XOP. */
27807 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27808 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27809 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27810 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27811 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27812 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27813 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27814 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27815 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27816 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27817 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27818 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27819 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27820 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27821 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27822 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27823 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27824 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27825 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27826 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27827 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27828 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27829 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27830 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27831 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27832 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27833 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27834 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27835 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27836 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27837 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27838 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27839 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27840 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27841 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27842 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27843 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27844 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27845 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27846 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27847 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27848 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27849 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27850 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27851 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27852 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27853 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27854 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27855 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27856 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27857 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27858 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27860 static const struct builtin_description bdesc_multi_arg[] =
27862 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27863 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27864 UNKNOWN, (int)MULTI_ARG_3_SF },
27865 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27866 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27867 UNKNOWN, (int)MULTI_ARG_3_DF },
27869 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27870 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27871 UNKNOWN, (int)MULTI_ARG_3_SF },
27872 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27873 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27874 UNKNOWN, (int)MULTI_ARG_3_DF },
27876 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27877 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27878 UNKNOWN, (int)MULTI_ARG_3_SF },
27879 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27880 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27881 UNKNOWN, (int)MULTI_ARG_3_DF },
27882 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27883 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27884 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27885 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27886 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27887 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27889 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27890 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27891 UNKNOWN, (int)MULTI_ARG_3_SF },
27892 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27893 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27894 UNKNOWN, (int)MULTI_ARG_3_DF },
27895 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27896 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27897 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27898 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27899 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27900 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
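/* Editor's illustrative sketch (not part of the original i386.c): how one
   bdesc_multi_arg entry above surfaces to the user once the registration
   loop in ix86_init_mmx_sse_builtins has run.  The entry tagged
   MULTI_ARG_3_SF for "__builtin_ia32_vfmaddps" makes the builtin callable
   from code compiled with -mfma or -mfma4; kept under #if 0 because the
   builtin only exists when one of those ISAs is enabled.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));

static v4sf
fma_ps_example (v4sf a, v4sf b, v4sf c)
{
  /* Expands via CODE_FOR_fma4i_fmadd_v4sf to a fused a*b+c.  */
  return __builtin_ia32_vfmaddps (a, b, c);
}
#endif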
28062 /* TM vector builtins. */
28064 /* Reuse the existing x86-specific `struct builtin_description' because
28065 we're lazy.  Add casts to make them fit. */
28066 static const struct builtin_description bdesc_tm[] =
28068 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28069 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28070 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28071 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28072 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28073 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28074 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28076 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28077 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28078 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28079 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28080 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28081 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28082 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28084 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28085 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28086 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28087 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28088 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28089 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28090 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28092 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28093 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28094 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28097 /* TM callbacks. */
28099 /* Return the builtin decl needed to load a vector of TYPE. */
28101 static tree
28102 ix86_builtin_tm_load (tree type)
28104 if (TREE_CODE (type) == VECTOR_TYPE)
28106 switch (tree_low_cst (TYPE_SIZE (type), 1))
28108 case 64:
28109 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28110 case 128:
28111 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28112 case 256:
28113 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28116 return NULL_TREE;
28119 /* Return the builtin decl needed to store a vector of TYPE. */
28121 static tree
28122 ix86_builtin_tm_store (tree type)
28124 if (TREE_CODE (type) == VECTOR_TYPE)
28126 switch (tree_low_cst (TYPE_SIZE (type), 1))
28128 case 64:
28129 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28130 case 128:
28131 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28132 case 256:
28133 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28136 return NULL_TREE;
28139 /* Initialize the transactional memory vector load/store builtins. */
28141 static void
28142 ix86_init_tm_builtins (void)
28144 enum ix86_builtin_func_type ftype;
28145 const struct builtin_description *d;
28146 size_t i;
28147 tree decl;
28148 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28149 tree attrs_log, attrs_type_log;
28151 if (!flag_tm)
28152 return;
28154 /* If there are no builtins defined, we must be compiling in a
28155 language without trans-mem support. */
28156 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28157 return;
28159 /* Use whatever attributes a normal TM load has. */
28160 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28161 attrs_load = DECL_ATTRIBUTES (decl);
28162 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28163 /* Use whatever attributes a normal TM store has. */
28164 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28165 attrs_store = DECL_ATTRIBUTES (decl);
28166 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28167 /* Use whatever attributes a normal TM log has. */
28168 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28169 attrs_log = DECL_ATTRIBUTES (decl);
28170 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28172 for (i = 0, d = bdesc_tm;
28173 i < ARRAY_SIZE (bdesc_tm);
28174 i++, d++)
28176 if ((d->mask & ix86_isa_flags) != 0
28177 || (lang_hooks.builtin_function
28178 == lang_hooks.builtin_function_ext_scope))
28180 tree type, attrs, attrs_type;
28181 enum built_in_function code = (enum built_in_function) d->code;
28183 ftype = (enum ix86_builtin_func_type) d->flag;
28184 type = ix86_get_builtin_func_type (ftype);
28186 if (BUILTIN_TM_LOAD_P (code))
28188 attrs = attrs_load;
28189 attrs_type = attrs_type_load;
28191 else if (BUILTIN_TM_STORE_P (code))
28193 attrs = attrs_store;
28194 attrs_type = attrs_type_store;
28196 else
28198 attrs = attrs_log;
28199 attrs_type = attrs_type_log;
28201 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28202 /* The builtin without the prefix for
28203 calling it directly. */
28204 d->name + strlen ("__builtin_"),
28205 attrs);
28206 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28207 set the TYPE_ATTRIBUTES. */
28208 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28210 set_builtin_decl (code, decl, false);
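/* Editor's note (a sketch, assuming the usual behaviour of
   add_builtin_function): passing d->name + strlen ("__builtin_") as the
   library name means a use of the builtin emits a call to the matching
   libitm symbol, e.g. __builtin__ITM_WM64 -> _ITM_WM64.  For the M64
   store entry that is roughly equivalent to declaring:  */
#if 0
typedef int v2si __attribute__ ((vector_size (8)));
extern void _ITM_WM64 (v2si *, v2si);   /* hypothetical prototype sketch */
#endif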
28215 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
28216 in the current target ISA, so that the user can compile particular modules
28217 with target-specific options that differ from the command-line
28218 options. */
28219 static void
28220 ix86_init_mmx_sse_builtins (void)
28222 const struct builtin_description * d;
28223 enum ix86_builtin_func_type ftype;
28224 size_t i;
28226 /* Add all special builtins with a variable number of operands. */
28227 for (i = 0, d = bdesc_special_args;
28228 i < ARRAY_SIZE (bdesc_special_args);
28229 i++, d++)
28231 if (d->name == 0)
28232 continue;
28234 ftype = (enum ix86_builtin_func_type) d->flag;
28235 def_builtin (d->mask, d->name, ftype, d->code);
28238 /* Add all builtins with a variable number of operands. */
28239 for (i = 0, d = bdesc_args;
28240 i < ARRAY_SIZE (bdesc_args);
28241 i++, d++)
28243 if (d->name == 0)
28244 continue;
28246 ftype = (enum ix86_builtin_func_type) d->flag;
28247 def_builtin_const (d->mask, d->name, ftype, d->code);
28250 /* pcmpestr[im] insns. */
28251 for (i = 0, d = bdesc_pcmpestr;
28252 i < ARRAY_SIZE (bdesc_pcmpestr);
28253 i++, d++)
28255 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28256 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28257 else
28258 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28259 def_builtin_const (d->mask, d->name, ftype, d->code);
28262 /* pcmpistr[im] insns. */
28263 for (i = 0, d = bdesc_pcmpistr;
28264 i < ARRAY_SIZE (bdesc_pcmpistr);
28265 i++, d++)
28267 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28268 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28269 else
28270 ftype = INT_FTYPE_V16QI_V16QI_INT;
28271 def_builtin_const (d->mask, d->name, ftype, d->code);
28274 /* comi/ucomi insns. */
28275 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28277 if (d->mask == OPTION_MASK_ISA_SSE2)
28278 ftype = INT_FTYPE_V2DF_V2DF;
28279 else
28280 ftype = INT_FTYPE_V4SF_V4SF;
28281 def_builtin_const (d->mask, d->name, ftype, d->code);
28284 /* SSE */
28285 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28286 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28287 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28288 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28290 /* SSE or 3DNow!A */
28291 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28292 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28293 IX86_BUILTIN_MASKMOVQ);
28295 /* SSE2 */
28296 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28297 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28299 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28300 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28301 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28302 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28304 /* SSE3. */
28305 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28306 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28307 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28308 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28310 /* AES */
28311 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28312 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28313 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28314 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28315 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28316 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28317 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28318 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28319 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28320 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28321 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28322 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28324 /* PCLMUL */
28325 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28326 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28328 /* RDRND */
28329 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28330 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28331 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28332 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28333 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28334 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28335 IX86_BUILTIN_RDRAND64_STEP);
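/* Editor's illustrative sketch: the *_step builtins above mirror the
   RDRAND instruction's carry-flag protocol.  Each call stores a random
   value through the pointer and returns nonzero only when the hardware
   had entropy available, so callers are expected to retry.  Assumes
   -mrdrnd; kept under #if 0 because i386.c itself is host code.  */
#if 0
static int
get_random_u32 (unsigned int *out)
{
  int tries;
  for (tries = 0; tries < 10; tries++)
    if (__builtin_ia32_rdrand32_step (out))
      return 1;                 /* success: *out holds a random value */
  return 0;                     /* hardware kept reporting no entropy */
}
#endif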
28337 /* AVX2 */
28338 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28339 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28340 IX86_BUILTIN_GATHERSIV2DF);
28342 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28343 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28344 IX86_BUILTIN_GATHERSIV4DF);
28346 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28347 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28348 IX86_BUILTIN_GATHERDIV2DF);
28350 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28351 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28352 IX86_BUILTIN_GATHERDIV4DF);
28354 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28355 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28356 IX86_BUILTIN_GATHERSIV4SF);
28358 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28359 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28360 IX86_BUILTIN_GATHERSIV8SF);
28362 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28363 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28364 IX86_BUILTIN_GATHERDIV4SF);
28366 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28367 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28368 IX86_BUILTIN_GATHERDIV8SF);
28370 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28371 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28372 IX86_BUILTIN_GATHERSIV2DI);
28374 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28375 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28376 IX86_BUILTIN_GATHERSIV4DI);
28378 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28379 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28380 IX86_BUILTIN_GATHERDIV2DI);
28382 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28383 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28384 IX86_BUILTIN_GATHERDIV4DI);
28386 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28387 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28388 IX86_BUILTIN_GATHERSIV4SI);
28390 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28391 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28392 IX86_BUILTIN_GATHERSIV8SI);
28394 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28395 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28396 IX86_BUILTIN_GATHERDIV4SI);
28398 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28399 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28400 IX86_BUILTIN_GATHERDIV8SI);
28402 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28403 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28404 IX86_BUILTIN_GATHERALTSIV4DF);
28406 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28407 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28408 IX86_BUILTIN_GATHERALTDIV8SF);
28410 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28411 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28412 IX86_BUILTIN_GATHERALTSIV4DI);
28414 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28415 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28416 IX86_BUILTIN_GATHERALTDIV8SI);
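/* Editor's illustrative sketch: the gather builtins above back the AVX2
   gather intrinsics in avx2intrin.h.  A user-level view, assuming -mavx2
   and <immintrin.h>:  */
#if 0
#include <immintrin.h>

static __m256d
gather_four_doubles (const double *base, __m128i idx)
{
  /* Loads base[idx[0..3]] with a single vgatherdpd; this intrinsic is
     backed by the IX86_BUILTIN_GATHERSIV4DF builtin registered above.  */
  return _mm256_i32gather_pd (base, idx, 8);
}
#endif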
28418 /* RTM. */
28419 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28420 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28422 /* MMX access to the vec_init patterns. */
28423 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28424 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28426 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28427 V4HI_FTYPE_HI_HI_HI_HI,
28428 IX86_BUILTIN_VEC_INIT_V4HI);
28430 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28431 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28432 IX86_BUILTIN_VEC_INIT_V8QI);
28434 /* Access to the vec_extract patterns. */
28435 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28436 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28437 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28438 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28439 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28440 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28441 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28442 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28443 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28444 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28446 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28447 "__builtin_ia32_vec_ext_v4hi",
28448 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28450 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28451 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28453 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28454 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28456 /* Access to the vec_set patterns. */
28457 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28458 "__builtin_ia32_vec_set_v2di",
28459 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28461 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28462 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28464 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28465 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28467 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28468 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28470 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28471 "__builtin_ia32_vec_set_v4hi",
28472 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28474 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28475 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28477 /* RDSEED */
28478 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28479 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28480 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28481 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28482 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28483 "__builtin_ia32_rdseed_di_step",
28484 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28486 /* ADCX */
28487 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28488 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28489 def_builtin (OPTION_MASK_ISA_64BIT,
28490 "__builtin_ia32_addcarryx_u64",
28491 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28492 IX86_BUILTIN_ADDCARRYX64);
28494 /* Add the multi-arg builtins (FMA4, FMA and XOP). */
28495 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28497 if (d->name == 0)
28498 continue;
28500 ftype = (enum ix86_builtin_func_type) d->flag;
28501 def_builtin_const (d->mask, d->name, ftype, d->code);
28505 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28506 to return a pointer to VERSION_DECL if the outcome of the expression
28507 formed by PREDICATE_CHAIN is true. This function will be called during
28508 version dispatch to decide which function version to execute. It returns
28509 the basic block at the end, to which more conditions can be added. */
28511 static basic_block
28512 add_condition_to_bb (tree function_decl, tree version_decl,
28513 tree predicate_chain, basic_block new_bb)
28515 gimple return_stmt;
28516 tree convert_expr, result_var;
28517 gimple convert_stmt;
28518 gimple call_cond_stmt;
28519 gimple if_else_stmt;
28521 basic_block bb1, bb2, bb3;
28522 edge e12, e23;
28524 tree cond_var, and_expr_var = NULL_TREE;
28525 gimple_seq gseq;
28527 tree predicate_decl, predicate_arg;
28529 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28531 gcc_assert (new_bb != NULL);
28532 gseq = bb_seq (new_bb);
28535 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28536 build_fold_addr_expr (version_decl));
28537 result_var = create_tmp_var (ptr_type_node, NULL);
28538 convert_stmt = gimple_build_assign (result_var, convert_expr);
28539 return_stmt = gimple_build_return (result_var);
28541 if (predicate_chain == NULL_TREE)
28543 gimple_seq_add_stmt (&gseq, convert_stmt);
28544 gimple_seq_add_stmt (&gseq, return_stmt);
28545 set_bb_seq (new_bb, gseq);
28546 gimple_set_bb (convert_stmt, new_bb);
28547 gimple_set_bb (return_stmt, new_bb);
28548 pop_cfun ();
28549 return new_bb;
28552 while (predicate_chain != NULL)
28554 cond_var = create_tmp_var (integer_type_node, NULL);
28555 predicate_decl = TREE_PURPOSE (predicate_chain);
28556 predicate_arg = TREE_VALUE (predicate_chain);
28557 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28558 gimple_call_set_lhs (call_cond_stmt, cond_var);
28560 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28561 gimple_set_bb (call_cond_stmt, new_bb);
28562 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28564 predicate_chain = TREE_CHAIN (predicate_chain);
28566 if (and_expr_var == NULL)
28567 and_expr_var = cond_var;
28568 else
28570 gimple assign_stmt;
28571 /* Use MIN_EXPR to check whether any integer is zero:
28572 and_expr_var = min_expr <cond_var, and_expr_var>. */
28573 assign_stmt = gimple_build_assign (and_expr_var,
28574 build2 (MIN_EXPR, integer_type_node,
28575 cond_var, and_expr_var));
28577 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28578 gimple_set_bb (assign_stmt, new_bb);
28579 gimple_seq_add_stmt (&gseq, assign_stmt);
28583 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28584 integer_zero_node,
28585 NULL_TREE, NULL_TREE);
28586 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28587 gimple_set_bb (if_else_stmt, new_bb);
28588 gimple_seq_add_stmt (&gseq, if_else_stmt);
28590 gimple_seq_add_stmt (&gseq, convert_stmt);
28591 gimple_seq_add_stmt (&gseq, return_stmt);
28592 set_bb_seq (new_bb, gseq);
28594 bb1 = new_bb;
28595 e12 = split_block (bb1, if_else_stmt);
28596 bb2 = e12->dest;
28597 e12->flags &= ~EDGE_FALLTHRU;
28598 e12->flags |= EDGE_TRUE_VALUE;
28600 e23 = split_block (bb2, return_stmt);
28602 gimple_set_bb (convert_stmt, bb2);
28603 gimple_set_bb (return_stmt, bb2);
28605 bb3 = e23->dest;
28606 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28608 remove_edge (e23);
28609 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28611 pop_cfun ();
28613 return bb3;
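/* Editor's sketch of the control flow this helper builds up, written as
   the C a reader might imagine for a dispatcher with two specialised
   versions (foo_corei7/foo_avx2/foo_default are hypothetical names; the
   real output is GIMPLE, and the "&&" chaining is realised with the
   MIN_EXPR trick above):  */
#if 0
void *
foo_resolver_shape (void)
{
  if (__builtin_cpu_is ("corei7") /* && further predicates...  */)
    return (void *) foo_corei7;
  if (__builtin_cpu_supports ("avx2"))
    return (void *) foo_avx2;
  return (void *) foo_default;
}
#endif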
28616 /* This parses the attribute arguments to target in DECL and determines
28617 the right builtin to use to match the platform specification.
28618 It returns the priority value for this version decl. If PREDICATE_LIST
28619 is not NULL, it stores the list of cpu features that need to be checked
28620 before dispatching this function. */
28622 static unsigned int
28623 get_builtin_code_for_version (tree decl, tree *predicate_list)
28625 tree attrs;
28626 struct cl_target_option cur_target;
28627 tree target_node;
28628 struct cl_target_option *new_target;
28629 const char *arg_str = NULL;
28630 const char *attrs_str = NULL;
28631 char *tok_str = NULL;
28632 char *token;
28634 /* Priority of i386 features; a greater value means a higher priority. This is
28635 used to decide the order in which function dispatch must happen. For
28636 instance, a version specialized for SSE4.2 should be checked for dispatch
28637 before a version for SSE3, as SSE4.2 implies SSE3. */
28638 enum feature_priority
28640 P_ZERO = 0,
28641 P_MMX,
28642 P_SSE,
28643 P_SSE2,
28644 P_SSE3,
28645 P_SSSE3,
28646 P_PROC_SSSE3,
28647 P_SSE4_a,
28648 P_PROC_SSE4_a,
28649 P_SSE4_1,
28650 P_SSE4_2,
28651 P_PROC_SSE4_2,
28652 P_POPCNT,
28653 P_AVX,
28654 P_AVX2,
28655 P_FMA,
28656 P_PROC_FMA
28659 enum feature_priority priority = P_ZERO;
28661 /* These are the target attribute strings for which a dispatcher is
28662 available, from fold_builtin_cpu. */
28664 static struct _feature_list
28666 const char *const name;
28667 const enum feature_priority priority;
28669 const feature_list[] =
28671 {"mmx", P_MMX},
28672 {"sse", P_SSE},
28673 {"sse2", P_SSE2},
28674 {"sse3", P_SSE3},
28675 {"ssse3", P_SSSE3},
28676 {"sse4.1", P_SSE4_1},
28677 {"sse4.2", P_SSE4_2},
28678 {"popcnt", P_POPCNT},
28679 {"avx", P_AVX},
28680 {"avx2", P_AVX2}
28684 static unsigned int NUM_FEATURES
28685 = sizeof (feature_list) / sizeof (struct _feature_list);
28687 unsigned int i;
28689 tree predicate_chain = NULL_TREE;
28690 tree predicate_decl, predicate_arg;
28692 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28693 gcc_assert (attrs != NULL);
28695 attrs = TREE_VALUE (TREE_VALUE (attrs));
28697 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28698 attrs_str = TREE_STRING_POINTER (attrs);
28700 /* Return priority zero for the default function. */
28701 if (strcmp (attrs_str, "default") == 0)
28702 return 0;
28704 /* Handle arch= if specified. For priority, set it to be 1 more than
28705 the best instruction set the processor can handle. For instance, if
28706 there is a version for atom and a version for ssse3 (the highest ISA
28707 priority for atom), the atom version must be checked for dispatch
28708 before the ssse3 version. */
28709 if (strstr (attrs_str, "arch=") != NULL)
28711 cl_target_option_save (&cur_target, &global_options);
28712 target_node = ix86_valid_target_attribute_tree (attrs);
28714 gcc_assert (target_node);
28715 new_target = TREE_TARGET_OPTION (target_node);
28716 gcc_assert (new_target);
28718 if (new_target->arch_specified && new_target->arch > 0)
28720 switch (new_target->arch)
28722 case PROCESSOR_CORE2:
28723 arg_str = "core2";
28724 priority = P_PROC_SSSE3;
28725 break;
28726 case PROCESSOR_COREI7:
28727 arg_str = "corei7";
28728 priority = P_PROC_SSE4_2;
28729 break;
28730 case PROCESSOR_ATOM:
28731 arg_str = "atom";
28732 priority = P_PROC_SSSE3;
28733 break;
28734 case PROCESSOR_AMDFAM10:
28735 arg_str = "amdfam10h";
28736 priority = P_PROC_SSE4_a;
28737 break;
28738 case PROCESSOR_BDVER1:
28739 arg_str = "bdver1";
28740 priority = P_PROC_FMA;
28741 break;
28742 case PROCESSOR_BDVER2:
28743 arg_str = "bdver2";
28744 priority = P_PROC_FMA;
28745 break;
28749 cl_target_option_restore (&global_options, &cur_target);
28751 if (predicate_list && arg_str == NULL)
28753 error_at (DECL_SOURCE_LOCATION (decl),
28754 "No dispatcher found for the versioning attributes");
28755 return 0;
28758 if (predicate_list)
28760 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28761 /* For a C string literal the length includes the trailing NULL. */
28762 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28763 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28764 predicate_chain);
28768 /* Process feature name. */
28769 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28770 strcpy (tok_str, attrs_str);
28771 token = strtok (tok_str, ",");
28772 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28774 while (token != NULL)
28776 /* Do not process "arch=" */
28777 if (strncmp (token, "arch=", 5) == 0)
28779 token = strtok (NULL, ",");
28780 continue;
28782 for (i = 0; i < NUM_FEATURES; ++i)
28784 if (strcmp (token, feature_list[i].name) == 0)
28786 if (predicate_list)
28788 predicate_arg = build_string_literal (
28789 strlen (feature_list[i].name) + 1,
28790 feature_list[i].name);
28791 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28792 predicate_chain);
28794 /* Find the maximum priority feature. */
28795 if (feature_list[i].priority > priority)
28796 priority = feature_list[i].priority;
28798 break;
28801 if (predicate_list && i == NUM_FEATURES)
28803 error_at (DECL_SOURCE_LOCATION (decl),
28804 "No dispatcher found for %s", token);
28805 return 0;
28807 token = strtok (NULL, ",");
28809 free (tok_str);
28811 if (predicate_list && predicate_chain == NULL_TREE)
28813 error_at (DECL_SOURCE_LOCATION (decl),
28814 "No dispatcher found for the versioning attributes : %s",
28815 attrs_str);
28816 return 0;
28818 else if (predicate_list)
28820 predicate_chain = nreverse (predicate_chain);
28821 *predicate_list = predicate_chain;
28824 return priority;
28827 /* This compares the priority of target features in function DECL1
28828 and DECL2. It returns positive value if DECL1 is higher priority,
28829 negative value if DECL2 is higher priority and 0 if they are the
28830 same. */
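/* For instance (illustrative only): a decl targeted at "avx2" compares
   higher than one targeted at "sse4.2", because P_AVX2 follows P_SSE4_2 in
   the feature_priority enumeration above.  */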
28832 static int
28833 ix86_compare_version_priority (tree decl1, tree decl2)
28835 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
28836 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
28838 return (int)priority1 - (int)priority2;
28841 /* V1 and V2 point to function versions with different priorities
28842 based on the target ISA. This function compares their priorities. */
28844 static int
28845 feature_compare (const void *v1, const void *v2)
28847 typedef struct _function_version_info
28849 tree version_decl;
28850 tree predicate_chain;
28851 unsigned int dispatch_priority;
28852 } function_version_info;
28854 const function_version_info c1 = *(const function_version_info *)v1;
28855 const function_version_info c2 = *(const function_version_info *)v2;
28856 return (c2.dispatch_priority - c1.dispatch_priority);
28859 /* This function generates the dispatch function for
28860 multi-versioned functions. DISPATCH_DECL is the function which will
28861 contain the dispatch logic. FNDECLS are the function choices for
28862 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28863 in DISPATCH_DECL in which the dispatch code is generated. */
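/* Conceptually (an illustrative sketch, not literal generated code), the
   dispatch logic built here for a versioned function foo amounts to:

     if (__builtin_cpu_supports ("avx2")) return <foo.avx2>;
     if (__builtin_cpu_is ("atom"))       return <foo.arch_atom>;
     return <foo, the default version>;

   with the versions tested in decreasing priority order and the default
   version dispatched last.  */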
28865 static int
28866 dispatch_function_versions (tree dispatch_decl,
28867 void *fndecls_p,
28868 basic_block *empty_bb)
28870 tree default_decl;
28871 gimple ifunc_cpu_init_stmt;
28872 gimple_seq gseq;
28873 int ix;
28874 tree ele;
28875 vec<tree> *fndecls;
28876 unsigned int num_versions = 0;
28877 unsigned int actual_versions = 0;
28878 unsigned int i;
28880 struct _function_version_info
28882 tree version_decl;
28883 tree predicate_chain;
28884 unsigned int dispatch_priority;
28885 }*function_version_info;
28887 gcc_assert (dispatch_decl != NULL
28888 && fndecls_p != NULL
28889 && empty_bb != NULL);
28891 /* fndecls_p is actually a vector.  */
28892 fndecls = static_cast<vec<tree> *> (fndecls_p);
28894 /* At least one more version other than the default. */
28895 num_versions = fndecls->length ();
28896 gcc_assert (num_versions >= 2);
28898 function_version_info = (struct _function_version_info *)
28899 XNEWVEC (struct _function_version_info, (num_versions - 1));
28901 /* The first version in the vector is the default decl. */
28902 default_decl = (*fndecls)[0];
28904 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28906 gseq = bb_seq (*empty_bb);
28907 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28908 constructors, so explicitly call __builtin_cpu_init here. */
28909 ifunc_cpu_init_stmt = gimple_build_call_vec (
28910 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28911 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28912 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28913 set_bb_seq (*empty_bb, gseq);
28915 pop_cfun ();
28918 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28920 tree version_decl = ele;
28921 tree predicate_chain = NULL_TREE;
28922 unsigned int priority;
28923 /* Get attribute string, parse it and find the right predicate decl.
28924 The predicate function could be a lengthy combination of many
28925 features, like arch-type and various isa-variants. */
28926 priority = get_builtin_code_for_version (version_decl,
28927 &predicate_chain);
28929 if (predicate_chain == NULL_TREE)
28930 continue;
28932 actual_versions++;
28933 function_version_info [ix - 1].version_decl = version_decl;
28934 function_version_info [ix - 1].predicate_chain = predicate_chain;
28935 function_version_info [ix - 1].dispatch_priority = priority;
28938 /* Sort the versions according to descending order of dispatch priority. The
28939 priority is based on the ISA. This is not a perfect solution. There
28940 could still be ambiguity. If more than one function version is suitable
28941 to execute, which one should be dispatched? In future, allow the user
28942 to specify a dispatch priority next to the version. */
28943 qsort (function_version_info, actual_versions,
28944 sizeof (struct _function_version_info), feature_compare);
28946 for (i = 0; i < actual_versions; ++i)
28947 *empty_bb = add_condition_to_bb (dispatch_decl,
28948 function_version_info[i].version_decl,
28949 function_version_info[i].predicate_chain,
28950 *empty_bb);
28952 /* Dispatch the default version at the end. */
28953 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28954 NULL, *empty_bb);
28956 free (function_version_info);
28957 return 0;
28960 /* Comparator function to be used in qsort routine to sort attribute
28961 specification strings to "target". */
28963 static int
28964 attr_strcmp (const void *v1, const void *v2)
28966 const char *c1 = *(char *const*)v1;
28967 const char *c2 = *(char *const*)v2;
28968 return strcmp (c1, c2);
28971 /* ARGLIST is the argument to target attribute. This function tokenizes
28972 the comma separated arguments, sorts them and returns a string which
28973 is a unique identifier for the comma separated arguments. It also
28974 replaces non-identifier characters "=,-" with "_". */
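/* For example (illustrative only): the argument string "avx,arch=corei7"
   is first rewritten to "avx,arch_corei7", split into the tokens "avx" and
   "arch_corei7", sorted, and rejoined to yield "arch_corei7_avx".  */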
28976 static char *
28977 sorted_attr_string (tree arglist)
28979 tree arg;
28980 size_t str_len_sum = 0;
28981 char **args = NULL;
28982 char *attr_str, *ret_str;
28983 char *attr = NULL;
28984 unsigned int argnum = 1;
28985 unsigned int i;
28987 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
28989 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
28990 size_t len = strlen (str);
28991 str_len_sum += len + 1;
28992 if (arg != arglist)
28993 argnum++;
28994 for (i = 0; i < strlen (str); i++)
28995 if (str[i] == ',')
28996 argnum++;
28999 attr_str = XNEWVEC (char, str_len_sum);
29000 str_len_sum = 0;
29001 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29003 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29004 size_t len = strlen (str);
29005 memcpy (attr_str + str_len_sum, str, len);
29006 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29007 str_len_sum += len + 1;
29010 /* Replace "=" and "-" with "_"; "," is handled when the sorted
tokens are rejoined below. */
29011 for (i = 0; i < strlen (attr_str); i++)
29012 if (attr_str[i] == '=' || attr_str[i]== '-')
29013 attr_str[i] = '_';
29015 if (argnum == 1)
29016 return attr_str;
29018 args = XNEWVEC (char *, argnum);
29020 i = 0;
29021 attr = strtok (attr_str, ",");
29022 while (attr != NULL)
29024 args[i] = attr;
29025 i++;
29026 attr = strtok (NULL, ",");
29029 qsort (args, argnum, sizeof (char *), attr_strcmp);
29031 ret_str = XNEWVEC (char, str_len_sum);
29032 str_len_sum = 0;
29033 for (i = 0; i < argnum; i++)
29035 size_t len = strlen (args[i]);
29036 memcpy (ret_str + str_len_sum, args[i], len);
29037 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29038 str_len_sum += len + 1;
29041 XDELETEVEC (args);
29042 XDELETEVEC (attr_str);
29043 return ret_str;
29046 /* This function changes the assembler name for functions that are
29047 versions. If DECL is a function version and has a "target"
29048 attribute, it appends the attribute string to its assembler name. */
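/* For instance (illustrative only): a version of foo declared with
   __attribute__ ((target ("arch=corei7,avx"))) gets the assembler name
   "foo.arch_corei7_avx", while the "default" version keeps its original
   assembler name.  */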
29050 static tree
29051 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29053 tree version_attr;
29054 const char *orig_name, *version_string;
29055 char *attr_str, *assembler_name;
29057 if (DECL_DECLARED_INLINE_P (decl)
29058 && lookup_attribute ("gnu_inline",
29059 DECL_ATTRIBUTES (decl)))
29060 error_at (DECL_SOURCE_LOCATION (decl),
29061 "Function versions cannot be marked as gnu_inline,"
29062 " bodies have to be generated");
29064 if (DECL_VIRTUAL_P (decl)
29065 || DECL_VINDEX (decl))
29066 sorry ("Virtual function multiversioning not supported");
29068 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29070 /* target attribute string cannot be NULL. */
29071 gcc_assert (version_attr != NULL_TREE);
29073 orig_name = IDENTIFIER_POINTER (id);
29074 version_string
29075 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29077 if (strcmp (version_string, "default") == 0)
29078 return id;
29080 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29081 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29083 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29085 /* Allow assembler name to be modified if already set. */
29086 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29087 SET_DECL_RTL (decl, NULL);
29089 tree ret = get_identifier (assembler_name);
29090 XDELETEVEC (attr_str);
29091 XDELETEVEC (assembler_name);
29092 return ret;
29095 /* This function returns true if FN1 and FN2 are versions of the same function,
29096 that is, the target strings of the function decls are different. This assumes
29097 that FN1 and FN2 have the same signature. */
29099 static bool
29100 ix86_function_versions (tree fn1, tree fn2)
29102 tree attr1, attr2;
29103 char *target1, *target2;
29104 bool result;
29106 if (TREE_CODE (fn1) != FUNCTION_DECL
29107 || TREE_CODE (fn2) != FUNCTION_DECL)
29108 return false;
29110 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29111 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29113 /* At least one function decl should have the target attribute specified. */
29114 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29115 return false;
29117 /* Diagnose missing target attribute if one of the decls is already
29118 multi-versioned. */
29119 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29121 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29123 if (attr2 != NULL_TREE)
29125 tree tem = fn1;
29126 fn1 = fn2;
29127 fn2 = tem;
29128 attr1 = attr2;
29130 error_at (DECL_SOURCE_LOCATION (fn2),
29131 "missing %<target%> attribute for multi-versioned %D",
29132 fn2);
29133 error_at (DECL_SOURCE_LOCATION (fn1),
29134 "previous declaration of %D", fn1);
29135 /* Prevent diagnosing of the same error multiple times. */
29136 DECL_ATTRIBUTES (fn2)
29137 = tree_cons (get_identifier ("target"),
29138 copy_node (TREE_VALUE (attr1)),
29139 DECL_ATTRIBUTES (fn2));
29141 return false;
29144 target1 = sorted_attr_string (TREE_VALUE (attr1));
29145 target2 = sorted_attr_string (TREE_VALUE (attr2));
29147 /* The sorted target strings must be different for fn1 and fn2
29148 to be versions. */
29149 if (strcmp (target1, target2) == 0)
29150 result = false;
29151 else
29152 result = true;
29154 XDELETEVEC (target1);
29155 XDELETEVEC (target2);
29157 return result;
29160 static tree
29161 ix86_mangle_decl_assembler_name (tree decl, tree id)
29163 /* For function version, add the target suffix to the assembler name. */
29164 if (TREE_CODE (decl) == FUNCTION_DECL
29165 && DECL_FUNCTION_VERSIONED (decl))
29166 id = ix86_mangle_function_version_assembler_name (decl, id);
29167 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29168 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29169 #endif
29171 return id;
29174 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29175 is true, append the full path name of the source file. */
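/* E.g. (illustrative): for a decl whose assembler name is "foo" and SUFFIX
   "resolver", this yields "foo.resolver", or "foo.<unique>.resolver" when
   MAKE_UNIQUE is true, where <unique> comes from get_file_function_name
   and encodes the source file.  */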
29177 static char *
29178 make_name (tree decl, const char *suffix, bool make_unique)
29180 char *global_var_name;
29181 int name_len;
29182 const char *name;
29183 const char *unique_name = NULL;
29185 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29187 /* Get a unique name that can be used globally without any chances
29188 of collision at link time. */
29189 if (make_unique)
29190 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29192 name_len = strlen (name) + strlen (suffix) + 2;
29194 if (make_unique)
29195 name_len += strlen (unique_name) + 1;
29196 global_var_name = XNEWVEC (char, name_len);
29198 /* Use '.' to concatenate names as it is demangler friendly. */
29199 if (make_unique)
29200 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29201 suffix);
29202 else
29203 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29205 return global_var_name;
29208 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29210 /* Make a dispatcher declaration for the multi-versioned function DECL.
29211 Calls to DECL function will be replaced with calls to the dispatcher
29212 by the front-end. Return the decl created. */
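/* Sketch of the effect (illustrative): for a versioned function foo this
   creates an extern declaration named "foo.ifunc" with foo's signature;
   calls to foo are redirected to it, and its body is later emitted as an
   IFUNC whose resolver selects the right version at load time.  */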
29214 static tree
29215 make_dispatcher_decl (const tree decl)
29217 tree func_decl;
29218 char *func_name;
29219 tree fn_type, func_type;
29220 bool is_uniq = false;
29222 if (TREE_PUBLIC (decl) == 0)
29223 is_uniq = true;
29225 func_name = make_name (decl, "ifunc", is_uniq);
29227 fn_type = TREE_TYPE (decl);
29228 func_type = build_function_type (TREE_TYPE (fn_type),
29229 TYPE_ARG_TYPES (fn_type));
29231 func_decl = build_fn_decl (func_name, func_type);
29232 XDELETEVEC (func_name);
29233 TREE_USED (func_decl) = 1;
29234 DECL_CONTEXT (func_decl) = NULL_TREE;
29235 DECL_INITIAL (func_decl) = error_mark_node;
29236 DECL_ARTIFICIAL (func_decl) = 1;
29237 /* Mark this func as external, the resolver will flip it again if
29238 it gets generated. */
29239 DECL_EXTERNAL (func_decl) = 1;
29240 /* This will be of type IFUNC; IFUNCs have to be externally visible. */
29241 TREE_PUBLIC (func_decl) = 1;
29243 return func_decl;
29246 #endif
29248 /* Returns true if DECL is multi-versioned and is the default function,
29249 that is, it is not tagged with target-specific optimization. */
29251 static bool
29252 is_function_default_version (const tree decl)
29254 if (TREE_CODE (decl) != FUNCTION_DECL
29255 || !DECL_FUNCTION_VERSIONED (decl))
29256 return false;
29257 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29258 gcc_assert (attr);
29259 attr = TREE_VALUE (TREE_VALUE (attr));
29260 return (TREE_CODE (attr) == STRING_CST
29261 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29264 /* Make a dispatcher declaration for the multi-versioned function DECL.
29265 Calls to DECL function will be replaced with calls to the dispatcher
29266 by the front-end. Returns the decl of the dispatcher function. */
29268 static tree
29269 ix86_get_function_versions_dispatcher (void *decl)
29271 tree fn = (tree) decl;
29272 struct cgraph_node *node = NULL;
29273 struct cgraph_node *default_node = NULL;
29274 struct cgraph_function_version_info *node_v = NULL;
29275 struct cgraph_function_version_info *first_v = NULL;
29277 tree dispatch_decl = NULL;
29279 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29280 struct cgraph_function_version_info *it_v = NULL;
29281 struct cgraph_node *dispatcher_node = NULL;
29282 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29283 #endif
29285 struct cgraph_function_version_info *default_version_info = NULL;
29287 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29289 node = cgraph_get_node (fn);
29290 gcc_assert (node != NULL);
29292 node_v = get_cgraph_node_version (node);
29293 gcc_assert (node_v != NULL);
29295 if (node_v->dispatcher_resolver != NULL)
29296 return node_v->dispatcher_resolver;
29298 /* Find the default version and make it the first node. */
29299 first_v = node_v;
29300 /* Go to the beginning of the chain. */
29301 while (first_v->prev != NULL)
29302 first_v = first_v->prev;
29303 default_version_info = first_v;
29304 while (default_version_info != NULL)
29306 if (is_function_default_version
29307 (default_version_info->this_node->symbol.decl))
29308 break;
29309 default_version_info = default_version_info->next;
29312 /* If there is no default node, just return NULL. */
29313 if (default_version_info == NULL)
29314 return NULL;
29316 /* Make default info the first node. */
29317 if (first_v != default_version_info)
29319 default_version_info->prev->next = default_version_info->next;
29320 if (default_version_info->next)
29321 default_version_info->next->prev = default_version_info->prev;
29322 first_v->prev = default_version_info;
29323 default_version_info->next = first_v;
29324 default_version_info->prev = NULL;
29327 default_node = default_version_info->this_node;
29329 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29330 /* Right now, the dispatching is done via ifunc. */
29331 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29333 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29334 gcc_assert (dispatcher_node != NULL);
29335 dispatcher_node->dispatcher_function = 1;
29336 dispatcher_version_info
29337 = insert_new_cgraph_node_version (dispatcher_node);
29338 dispatcher_version_info->next = default_version_info;
29339 dispatcher_node->local.finalized = 1;
29341 /* Set the dispatcher for all the versions. */
29342 it_v = default_version_info;
29343 while (it_v != NULL)
29345 it_v->dispatcher_resolver = dispatch_decl;
29346 it_v = it_v->next;
29348 #else
29349 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29350 "multiversioning needs ifunc which is not supported "
29351 "in this configuration");
29352 #endif
29353 return dispatch_decl;
29356 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29357 it to CHAIN. */
29359 static tree
29360 make_attribute (const char *name, const char *arg_name, tree chain)
29362 tree attr_name;
29363 tree attr_arg_name;
29364 tree attr_args;
29365 tree attr;
29367 attr_name = get_identifier (name);
29368 attr_arg_name = build_string (strlen (arg_name), arg_name);
29369 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29370 attr = tree_cons (attr_name, attr_args, chain);
29371 return attr;
29374 /* Make the resolver function decl to dispatch the versions of
29375 a multi-versioned function, DEFAULT_DECL. Create an
29376 empty basic block in the resolver and store the pointer in
29377 EMPTY_BB. Return the decl of the resolver function. */
29379 static tree
29380 make_resolver_func (const tree default_decl,
29381 const tree dispatch_decl,
29382 basic_block *empty_bb)
29384 char *resolver_name;
29385 tree decl, type, decl_name, t;
29386 bool is_uniq = false;
29388 /* IFUNCs have to be globally visible. So, if the default_decl is
29389 not, then the name of the IFUNC should be made unique. */
29390 if (TREE_PUBLIC (default_decl) == 0)
29391 is_uniq = true;
29393 /* Append the filename to the resolver function if the versions are
29394 not externally visible. This is because the resolver function has
29395 to be externally visible for the loader to find it. So, appending
29396 the filename will prevent conflicts with a resolver function from
29397 another module which is based on the same version name. */
29398 resolver_name = make_name (default_decl, "resolver", is_uniq);
29400 /* The resolver function should return a (void *). */
29401 type = build_function_type_list (ptr_type_node, NULL_TREE);
29403 decl = build_fn_decl (resolver_name, type);
29404 decl_name = get_identifier (resolver_name);
29405 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29407 DECL_NAME (decl) = decl_name;
29408 TREE_USED (decl) = 1;
29409 DECL_ARTIFICIAL (decl) = 1;
29410 DECL_IGNORED_P (decl) = 0;
29411 /* IFUNC resolvers have to be externally visible. */
29412 TREE_PUBLIC (decl) = 1;
29413 DECL_UNINLINABLE (decl) = 0;
29415 /* Resolver is not external, body is generated. */
29416 DECL_EXTERNAL (decl) = 0;
29417 DECL_EXTERNAL (dispatch_decl) = 0;
29419 DECL_CONTEXT (decl) = NULL_TREE;
29420 DECL_INITIAL (decl) = make_node (BLOCK);
29421 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29423 if (DECL_COMDAT_GROUP (default_decl)
29424 || TREE_PUBLIC (default_decl))
29426 /* In this case, each translation unit with a call to this
29427 versioned function will put out a resolver. Ensure it
29428 is comdat to keep just one copy. */
29429 DECL_COMDAT (decl) = 1;
29430 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29432 /* Build result decl and add to function_decl. */
29433 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29434 DECL_ARTIFICIAL (t) = 1;
29435 DECL_IGNORED_P (t) = 1;
29436 DECL_RESULT (decl) = t;
29438 gimplify_function_tree (decl);
29439 push_cfun (DECL_STRUCT_FUNCTION (decl));
29440 *empty_bb = init_lowered_empty_function (decl, false);
29442 cgraph_add_new_function (decl, true);
29443 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29445 pop_cfun ();
29447 gcc_assert (dispatch_decl != NULL);
29448 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29449 DECL_ATTRIBUTES (dispatch_decl)
29450 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29452 /* Create the alias for dispatch to resolver here. */
29453 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29454 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29455 XDELETEVEC (resolver_name);
29456 return decl;
29459 /* Generate the dispatching code body to dispatch multi-versioned function
29460 DECL. The target hook is called to process the "target" attributes and
29461 provide the code to dispatch the right function at run-time. NODE points
29462 to the dispatcher decl whose body will be created. */
29464 static tree
29465 ix86_generate_version_dispatcher_body (void *node_p)
29467 tree resolver_decl;
29468 basic_block empty_bb;
29469 vec<tree> fn_ver_vec = vNULL;
29470 tree default_ver_decl;
29471 struct cgraph_node *versn;
29472 struct cgraph_node *node;
29474 struct cgraph_function_version_info *node_version_info = NULL;
29475 struct cgraph_function_version_info *versn_info = NULL;
29477 node = (cgraph_node *)node_p;
29479 node_version_info = get_cgraph_node_version (node);
29480 gcc_assert (node->dispatcher_function
29481 && node_version_info != NULL);
29483 if (node_version_info->dispatcher_resolver)
29484 return node_version_info->dispatcher_resolver;
29486 /* The first version in the chain corresponds to the default version. */
29487 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29489 /* node is going to be an alias, so remove the finalized bit. */
29490 node->local.finalized = false;
29492 resolver_decl = make_resolver_func (default_ver_decl,
29493 node->symbol.decl, &empty_bb);
29495 node_version_info->dispatcher_resolver = resolver_decl;
29497 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29499 fn_ver_vec.create (2);
29501 for (versn_info = node_version_info->next; versn_info;
29502 versn_info = versn_info->next)
29504 versn = versn_info->this_node;
29505 /* Check for virtual functions here again, as by this time it should
29506 have been determined if this function needs a vtable index or
29507 not. This happens for methods in derived classes that override
29508 virtual methods in base classes but are not explicitly marked as
29509 virtual. */
29510 if (DECL_VINDEX (versn->symbol.decl))
29511 sorry ("Virtual function multiversioning not supported");
29513 fn_ver_vec.safe_push (versn->symbol.decl);
29516 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29517 fn_ver_vec.release ();
29518 rebuild_cgraph_edges ();
29519 pop_cfun ();
29520 return resolver_decl;
29522 /* This builds the processor_model struct type defined in
29523 libgcc/config/i386/cpuinfo.c */
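/* The layout built below is assumed to mirror (see cpuinfo.c for the
   authoritative definition):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */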
29525 static tree
29526 build_processor_model_struct (void)
29528 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29529 "__cpu_features"};
29530 tree field = NULL_TREE, field_chain = NULL_TREE;
29531 int i;
29532 tree type = make_node (RECORD_TYPE);
29534 /* The first 3 fields are unsigned int. */
29535 for (i = 0; i < 3; ++i)
29537 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29538 get_identifier (field_name[i]), unsigned_type_node);
29539 if (field_chain != NULL_TREE)
29540 DECL_CHAIN (field) = field_chain;
29541 field_chain = field;
29544 /* The last field is an array of unsigned integers of size one. */
29545 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29546 get_identifier (field_name[3]),
29547 build_array_type (unsigned_type_node,
29548 build_index_type (size_one_node)));
29549 if (field_chain != NULL_TREE)
29550 DECL_CHAIN (field) = field_chain;
29551 field_chain = field;
29553 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29554 return type;
29557 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29559 static tree
29560 make_var_decl (tree type, const char *name)
29562 tree new_decl;
29564 new_decl = build_decl (UNKNOWN_LOCATION,
29565 VAR_DECL,
29566 get_identifier(name),
29567 type);
29569 DECL_EXTERNAL (new_decl) = 1;
29570 TREE_STATIC (new_decl) = 1;
29571 TREE_PUBLIC (new_decl) = 1;
29572 DECL_INITIAL (new_decl) = 0;
29573 DECL_ARTIFICIAL (new_decl) = 0;
29574 DECL_PRESERVE_P (new_decl) = 1;
29576 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29577 assemble_variable (new_decl, 0, 0, 0);
29579 return new_decl;
29582 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29583 into an integer defined in libgcc/config/i386/cpuinfo.c */
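/* For example (illustrative): __builtin_cpu_is ("corei7") folds to a test
   of __cpu_model.__cpu_type == M_INTEL_COREI7 - M_CPU_TYPE_START, and
   __builtin_cpu_supports ("avx") folds to
   __cpu_model.__cpu_features[0] & (1 << F_AVX).  */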
29585 static tree
29586 fold_builtin_cpu (tree fndecl, tree *args)
29588 unsigned int i;
29589 enum ix86_builtins fn_code = (enum ix86_builtins)
29590 DECL_FUNCTION_CODE (fndecl);
29591 tree param_string_cst = NULL;
29593 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29594 enum processor_features
29596 F_CMOV = 0,
29597 F_MMX,
29598 F_POPCNT,
29599 F_SSE,
29600 F_SSE2,
29601 F_SSE3,
29602 F_SSSE3,
29603 F_SSE4_1,
29604 F_SSE4_2,
29605 F_AVX,
29606 F_AVX2,
29607 F_MAX
29610 /* These are the values for vendor types and cpu types and subtypes
29611 in cpuinfo.c. Cpu type and subtype values should have the
29612 corresponding start value subtracted.
29613 enum processor_model
29615 M_INTEL = 1,
29616 M_AMD,
29617 M_CPU_TYPE_START,
29618 M_INTEL_ATOM,
29619 M_INTEL_CORE2,
29620 M_INTEL_COREI7,
29621 M_AMDFAM10H,
29622 M_AMDFAM15H,
29623 M_CPU_SUBTYPE_START,
29624 M_INTEL_COREI7_NEHALEM,
29625 M_INTEL_COREI7_WESTMERE,
29626 M_INTEL_COREI7_SANDYBRIDGE,
29627 M_AMDFAM10H_BARCELONA,
29628 M_AMDFAM10H_SHANGHAI,
29629 M_AMDFAM10H_ISTANBUL,
29630 M_AMDFAM15H_BDVER1,
29631 M_AMDFAM15H_BDVER2,
29632 M_AMDFAM15H_BDVER3
29635 static struct _arch_names_table
29637 const char *const name;
29638 const enum processor_model model;
29640 const arch_names_table[] =
29642 {"amd", M_AMD},
29643 {"intel", M_INTEL},
29644 {"atom", M_INTEL_ATOM},
29645 {"core2", M_INTEL_CORE2},
29646 {"corei7", M_INTEL_COREI7},
29647 {"nehalem", M_INTEL_COREI7_NEHALEM},
29648 {"westmere", M_INTEL_COREI7_WESTMERE},
29649 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29650 {"amdfam10h", M_AMDFAM10H},
29651 {"barcelona", M_AMDFAM10H_BARCELONA},
29652 {"shanghai", M_AMDFAM10H_SHANGHAI},
29653 {"istanbul", M_AMDFAM10H_ISTANBUL},
29654 {"amdfam15h", M_AMDFAM15H},
29655 {"bdver1", M_AMDFAM15H_BDVER1},
29656 {"bdver2", M_AMDFAM15H_BDVER2},
29657 {"bdver3", M_AMDFAM15H_BDVER3},
29660 static struct _isa_names_table
29662 const char *const name;
29663 const enum processor_features feature;
29665 const isa_names_table[] =
29667 {"cmov", F_CMOV},
29668 {"mmx", F_MMX},
29669 {"popcnt", F_POPCNT},
29670 {"sse", F_SSE},
29671 {"sse2", F_SSE2},
29672 {"sse3", F_SSE3},
29673 {"ssse3", F_SSSE3},
29674 {"sse4.1", F_SSE4_1},
29675 {"sse4.2", F_SSE4_2},
29676 {"avx", F_AVX},
29677 {"avx2", F_AVX2}
29680 tree __processor_model_type = build_processor_model_struct ();
29681 tree __cpu_model_var = make_var_decl (__processor_model_type,
29682 "__cpu_model");
29684 gcc_assert ((args != NULL) && (*args != NULL));
29686 param_string_cst = *args;
29687 while (param_string_cst
29688 && TREE_CODE (param_string_cst) != STRING_CST)
29690 /* *args must be an expr that can contain other EXPRs leading to a
29691 STRING_CST. */
29692 if (!EXPR_P (param_string_cst))
29694 error ("Parameter to builtin must be a string constant or literal");
29695 return integer_zero_node;
29697 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29700 gcc_assert (param_string_cst);
29702 if (fn_code == IX86_BUILTIN_CPU_IS)
29704 tree ref;
29705 tree field;
29706 tree final;
29708 unsigned int field_val = 0;
29709 unsigned int NUM_ARCH_NAMES
29710 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29712 for (i = 0; i < NUM_ARCH_NAMES; i++)
29713 if (strcmp (arch_names_table[i].name,
29714 TREE_STRING_POINTER (param_string_cst)) == 0)
29715 break;
29717 if (i == NUM_ARCH_NAMES)
29719 error ("Parameter to builtin not valid: %s",
29720 TREE_STRING_POINTER (param_string_cst));
29721 return integer_zero_node;
29724 field = TYPE_FIELDS (__processor_model_type);
29725 field_val = arch_names_table[i].model;
29727 /* CPU types are stored in the next field. */
29728 if (field_val > M_CPU_TYPE_START
29729 && field_val < M_CPU_SUBTYPE_START)
29731 field = DECL_CHAIN (field);
29732 field_val -= M_CPU_TYPE_START;
29735 /* CPU subtypes are stored two fields ahead, in __cpu_subtype. */
29736 if (field_val > M_CPU_SUBTYPE_START)
29738 field = DECL_CHAIN ( DECL_CHAIN (field));
29739 field_val -= M_CPU_SUBTYPE_START;
29742 /* Get the appropriate field in __cpu_model. */
29743 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29744 field, NULL_TREE);
29746 /* Check the value. */
29747 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29748 build_int_cstu (unsigned_type_node, field_val));
29749 return build1 (CONVERT_EXPR, integer_type_node, final);
29751 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29753 tree ref;
29754 tree array_elt;
29755 tree field;
29756 tree final;
29758 unsigned int field_val = 0;
29759 unsigned int NUM_ISA_NAMES
29760 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29762 for (i = 0; i < NUM_ISA_NAMES; i++)
29763 if (strcmp (isa_names_table[i].name,
29764 TREE_STRING_POINTER (param_string_cst)) == 0)
29765 break;
29767 if (i == NUM_ISA_NAMES)
29769 error ("Parameter to builtin not valid: %s",
29770 TREE_STRING_POINTER (param_string_cst));
29771 return integer_zero_node;
29774 field = TYPE_FIELDS (__processor_model_type);
29775 /* Get the last field, which is __cpu_features. */
29776 while (DECL_CHAIN (field))
29777 field = DECL_CHAIN (field);
29779 /* Get the appropriate field: __cpu_model.__cpu_features */
29780 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29781 field, NULL_TREE);
29783 /* Access the 0th element of __cpu_features array. */
29784 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29785 integer_zero_node, NULL_TREE, NULL_TREE);
29787 field_val = (1 << isa_names_table[i].feature);
29788 /* Return __cpu_model.__cpu_features[0] & field_val */
29789 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29790 build_int_cstu (unsigned_type_node, field_val));
29791 return build1 (CONVERT_EXPR, integer_type_node, final);
29793 gcc_unreachable ();
29796 static tree
29797 ix86_fold_builtin (tree fndecl, int n_args,
29798 tree *args, bool ignore ATTRIBUTE_UNUSED)
29800 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29802 enum ix86_builtins fn_code = (enum ix86_builtins)
29803 DECL_FUNCTION_CODE (fndecl);
29804 if (fn_code == IX86_BUILTIN_CPU_IS
29805 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29807 gcc_assert (n_args == 1);
29808 return fold_builtin_cpu (fndecl, args);
29812 #ifdef SUBTARGET_FOLD_BUILTIN
29813 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29814 #endif
29816 return NULL_TREE;
29819 /* Make builtins to detect cpu type and features supported. NAME is
29820 the builtin name, CODE is the builtin code, and FTYPE is the function
29821 type of the builtin. */
29823 static void
29824 make_cpu_type_builtin (const char* name, int code,
29825 enum ix86_builtin_func_type ftype, bool is_const)
29827 tree decl;
29828 tree type;
29830 type = ix86_get_builtin_func_type (ftype);
29831 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29832 NULL, NULL_TREE);
29833 gcc_assert (decl != NULL_TREE);
29834 ix86_builtins[(int) code] = decl;
29835 TREE_READONLY (decl) = is_const;
29838 /* Make builtins to get CPU type and features supported. The created
29839 builtins are:
29841 __builtin_cpu_init (), to detect cpu type and features,
29842 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29843 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
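/* A typical (illustrative) use from user code:

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       use_corei7_path ();
     else if (__builtin_cpu_supports ("sse4.2"))
       use_sse42_path ();
     else
       use_generic_path ();

   where the use_* functions are hypothetical.  */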
29846 static void
29847 ix86_init_platform_type_builtins (void)
29849 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29850 INT_FTYPE_VOID, false);
29851 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29852 INT_FTYPE_PCCHAR, true);
29853 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29854 INT_FTYPE_PCCHAR, true);
29857 /* Internal method for ix86_init_builtins. */
29859 static void
29860 ix86_init_builtins_va_builtins_abi (void)
29862 tree ms_va_ref, sysv_va_ref;
29863 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29864 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29865 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29866 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29868 if (!TARGET_64BIT)
29869 return;
29870 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29871 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29872 ms_va_ref = build_reference_type (ms_va_list_type_node);
29873 sysv_va_ref =
29874 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29876 fnvoid_va_end_ms =
29877 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29878 fnvoid_va_start_ms =
29879 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29880 fnvoid_va_end_sysv =
29881 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29882 fnvoid_va_start_sysv =
29883 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29884 NULL_TREE);
29885 fnvoid_va_copy_ms =
29886 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29887 NULL_TREE);
29888 fnvoid_va_copy_sysv =
29889 build_function_type_list (void_type_node, sysv_va_ref,
29890 sysv_va_ref, NULL_TREE);
29892 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29893 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29894 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29895 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29896 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29897 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29898 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29899 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29900 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29901 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29902 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29903 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29906 static void
29907 ix86_init_builtin_types (void)
29909 tree float128_type_node, float80_type_node;
29911 /* The __float80 type. */
29912 float80_type_node = long_double_type_node;
29913 if (TYPE_MODE (float80_type_node) != XFmode)
29915 /* The __float80 type. */
29916 float80_type_node = make_node (REAL_TYPE);
29918 TYPE_PRECISION (float80_type_node) = 80;
29919 layout_type (float80_type_node);
29921 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29923 /* The __float128 type. */
29924 float128_type_node = make_node (REAL_TYPE);
29925 TYPE_PRECISION (float128_type_node) = 128;
29926 layout_type (float128_type_node);
29927 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29929 /* This macro is built by i386-builtin-types.awk. */
29930 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29933 static void
29934 ix86_init_builtins (void)
29936 tree t;
29938 ix86_init_builtin_types ();
29940 /* Builtins to get CPU type and features. */
29941 ix86_init_platform_type_builtins ();
29943 /* TFmode support builtins. */
29944 def_builtin_const (0, "__builtin_infq",
29945 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29946 def_builtin_const (0, "__builtin_huge_valq",
29947 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29949 /* We will expand them to normal call if SSE isn't available since
29950 they are used by libgcc. */
29951 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29952 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29953 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29954 TREE_READONLY (t) = 1;
29955 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29957 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29958 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29959 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29960 TREE_READONLY (t) = 1;
29961 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29963 ix86_init_tm_builtins ();
29964 ix86_init_mmx_sse_builtins ();
29966 if (TARGET_LP64)
29967 ix86_init_builtins_va_builtins_abi ();
29969 #ifdef SUBTARGET_INIT_BUILTINS
29970 SUBTARGET_INIT_BUILTINS;
29971 #endif
29974 /* Return the ix86 builtin for CODE. */
29976 static tree
29977 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29979 if (code >= IX86_BUILTIN_MAX)
29980 return error_mark_node;
29982 return ix86_builtins[code];
29985 /* Errors in the source file can cause expand_expr to return const0_rtx
29986 where we expect a vector. To avoid crashing, use one of the vector
29987 clear instructions. */
29988 static rtx
29989 safe_vector_operand (rtx x, enum machine_mode mode)
29991 if (x == const0_rtx)
29992 x = CONST0_RTX (mode);
29993 return x;
29996 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
29998 static rtx
29999 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30001 rtx pat;
30002 tree arg0 = CALL_EXPR_ARG (exp, 0);
30003 tree arg1 = CALL_EXPR_ARG (exp, 1);
30004 rtx op0 = expand_normal (arg0);
30005 rtx op1 = expand_normal (arg1);
30006 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30007 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30008 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30010 if (VECTOR_MODE_P (mode0))
30011 op0 = safe_vector_operand (op0, mode0);
30012 if (VECTOR_MODE_P (mode1))
30013 op1 = safe_vector_operand (op1, mode1);
30015 if (optimize || !target
30016 || GET_MODE (target) != tmode
30017 || !insn_data[icode].operand[0].predicate (target, tmode))
30018 target = gen_reg_rtx (tmode);
30020 if (GET_MODE (op1) == SImode && mode1 == TImode)
30022 rtx x = gen_reg_rtx (V4SImode);
30023 emit_insn (gen_sse2_loadd (x, op1));
30024 op1 = gen_lowpart (TImode, x);
30027 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30028 op0 = copy_to_mode_reg (mode0, op0);
30029 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30030 op1 = copy_to_mode_reg (mode1, op1);
30032 pat = GEN_FCN (icode) (target, op0, op1);
30033 if (! pat)
30034 return 0;
30036 emit_insn (pat);
30038 return target;
30041 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30043 static rtx
30044 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30045 enum ix86_builtin_func_type m_type,
30046 enum rtx_code sub_code)
30048 rtx pat;
30049 int i;
30050 int nargs;
30051 bool comparison_p = false;
30052 bool tf_p = false;
30053 bool last_arg_constant = false;
30054 int num_memory = 0;
30055 struct {
30056 rtx op;
30057 enum machine_mode mode;
30058 } args[4];
30060 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30062 switch (m_type)
30064 case MULTI_ARG_4_DF2_DI_I:
30065 case MULTI_ARG_4_DF2_DI_I1:
30066 case MULTI_ARG_4_SF2_SI_I:
30067 case MULTI_ARG_4_SF2_SI_I1:
30068 nargs = 4;
30069 last_arg_constant = true;
30070 break;
30072 case MULTI_ARG_3_SF:
30073 case MULTI_ARG_3_DF:
30074 case MULTI_ARG_3_SF2:
30075 case MULTI_ARG_3_DF2:
30076 case MULTI_ARG_3_DI:
30077 case MULTI_ARG_3_SI:
30078 case MULTI_ARG_3_SI_DI:
30079 case MULTI_ARG_3_HI:
30080 case MULTI_ARG_3_HI_SI:
30081 case MULTI_ARG_3_QI:
30082 case MULTI_ARG_3_DI2:
30083 case MULTI_ARG_3_SI2:
30084 case MULTI_ARG_3_HI2:
30085 case MULTI_ARG_3_QI2:
30086 nargs = 3;
30087 break;
30089 case MULTI_ARG_2_SF:
30090 case MULTI_ARG_2_DF:
30091 case MULTI_ARG_2_DI:
30092 case MULTI_ARG_2_SI:
30093 case MULTI_ARG_2_HI:
30094 case MULTI_ARG_2_QI:
30095 nargs = 2;
30096 break;
30098 case MULTI_ARG_2_DI_IMM:
30099 case MULTI_ARG_2_SI_IMM:
30100 case MULTI_ARG_2_HI_IMM:
30101 case MULTI_ARG_2_QI_IMM:
30102 nargs = 2;
30103 last_arg_constant = true;
30104 break;
30106 case MULTI_ARG_1_SF:
30107 case MULTI_ARG_1_DF:
30108 case MULTI_ARG_1_SF2:
30109 case MULTI_ARG_1_DF2:
30110 case MULTI_ARG_1_DI:
30111 case MULTI_ARG_1_SI:
30112 case MULTI_ARG_1_HI:
30113 case MULTI_ARG_1_QI:
30114 case MULTI_ARG_1_SI_DI:
30115 case MULTI_ARG_1_HI_DI:
30116 case MULTI_ARG_1_HI_SI:
30117 case MULTI_ARG_1_QI_DI:
30118 case MULTI_ARG_1_QI_SI:
30119 case MULTI_ARG_1_QI_HI:
30120 nargs = 1;
30121 break;
30123 case MULTI_ARG_2_DI_CMP:
30124 case MULTI_ARG_2_SI_CMP:
30125 case MULTI_ARG_2_HI_CMP:
30126 case MULTI_ARG_2_QI_CMP:
30127 nargs = 2;
30128 comparison_p = true;
30129 break;
30131 case MULTI_ARG_2_SF_TF:
30132 case MULTI_ARG_2_DF_TF:
30133 case MULTI_ARG_2_DI_TF:
30134 case MULTI_ARG_2_SI_TF:
30135 case MULTI_ARG_2_HI_TF:
30136 case MULTI_ARG_2_QI_TF:
30137 nargs = 2;
30138 tf_p = true;
30139 break;
30141 default:
30142 gcc_unreachable ();
30145 if (optimize || !target
30146 || GET_MODE (target) != tmode
30147 || !insn_data[icode].operand[0].predicate (target, tmode))
30148 target = gen_reg_rtx (tmode);
30150 gcc_assert (nargs <= 4);
30152 for (i = 0; i < nargs; i++)
30154 tree arg = CALL_EXPR_ARG (exp, i);
30155 rtx op = expand_normal (arg);
30156 int adjust = (comparison_p) ? 1 : 0;
30157 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30159 if (last_arg_constant && i == nargs - 1)
30161 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30163 enum insn_code new_icode = icode;
30164 switch (icode)
30166 case CODE_FOR_xop_vpermil2v2df3:
30167 case CODE_FOR_xop_vpermil2v4sf3:
30168 case CODE_FOR_xop_vpermil2v4df3:
30169 case CODE_FOR_xop_vpermil2v8sf3:
30170 error ("the last argument must be a 2-bit immediate");
30171 return gen_reg_rtx (tmode);
30172 case CODE_FOR_xop_rotlv2di3:
30173 new_icode = CODE_FOR_rotlv2di3;
30174 goto xop_rotl;
30175 case CODE_FOR_xop_rotlv4si3:
30176 new_icode = CODE_FOR_rotlv4si3;
30177 goto xop_rotl;
30178 case CODE_FOR_xop_rotlv8hi3:
30179 new_icode = CODE_FOR_rotlv8hi3;
30180 goto xop_rotl;
30181 case CODE_FOR_xop_rotlv16qi3:
30182 new_icode = CODE_FOR_rotlv16qi3;
30183 xop_rotl:
30184 if (CONST_INT_P (op))
30186 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30187 op = GEN_INT (INTVAL (op) & mask);
30188 gcc_checking_assert
30189 (insn_data[icode].operand[i + 1].predicate (op, mode));
30191 else
30193 gcc_checking_assert
30194 (nargs == 2
30195 && insn_data[new_icode].operand[0].mode == tmode
30196 && insn_data[new_icode].operand[1].mode == tmode
30197 && insn_data[new_icode].operand[2].mode == mode
30198 && insn_data[new_icode].operand[0].predicate
30199 == insn_data[icode].operand[0].predicate
30200 && insn_data[new_icode].operand[1].predicate
30201 == insn_data[icode].operand[1].predicate);
30202 icode = new_icode;
30203 goto non_constant;
30205 break;
30206 default:
30207 gcc_unreachable ();
30211 else
30213 non_constant:
30214 if (VECTOR_MODE_P (mode))
30215 op = safe_vector_operand (op, mode);
30217 /* If we aren't optimizing, only allow one memory operand to be
30218 generated. */
30219 if (memory_operand (op, mode))
30220 num_memory++;
30222 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30224 if (optimize
30225 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30226 || num_memory > 1)
30227 op = force_reg (mode, op);
30230 args[i].op = op;
30231 args[i].mode = mode;
30234 switch (nargs)
30236 case 1:
30237 pat = GEN_FCN (icode) (target, args[0].op);
30238 break;
30240 case 2:
30241 if (tf_p)
30242 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30243 GEN_INT ((int)sub_code));
30244 else if (! comparison_p)
30245 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30246 else
30248 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30249 args[0].op,
30250 args[1].op);
30252 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30254 break;
30256 case 3:
30257 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30258 break;
30260 case 4:
30261 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30262 break;
30264 default:
30265 gcc_unreachable ();
30268 if (! pat)
30269 return 0;
30271 emit_insn (pat);
30272 return target;
30275 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30276 insns with vec_merge. */
30278 static rtx
30279 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30280 rtx target)
30282 rtx pat;
30283 tree arg0 = CALL_EXPR_ARG (exp, 0);
30284 rtx op1, op0 = expand_normal (arg0);
30285 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30286 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30288 if (optimize || !target
30289 || GET_MODE (target) != tmode
30290 || !insn_data[icode].operand[0].predicate (target, tmode))
30291 target = gen_reg_rtx (tmode);
30293 if (VECTOR_MODE_P (mode0))
30294 op0 = safe_vector_operand (op0, mode0);
30296 if ((optimize && !register_operand (op0, mode0))
30297 || !insn_data[icode].operand[1].predicate (op0, mode0))
30298 op0 = copy_to_mode_reg (mode0, op0);
30300 op1 = op0;
30301 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30302 op1 = copy_to_mode_reg (mode0, op1);
30304 pat = GEN_FCN (icode) (target, op0, op1);
30305 if (! pat)
30306 return 0;
30307 emit_insn (pat);
30308 return target;
30311 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30313 static rtx
30314 ix86_expand_sse_compare (const struct builtin_description *d,
30315 tree exp, rtx target, bool swap)
30317 rtx pat;
30318 tree arg0 = CALL_EXPR_ARG (exp, 0);
30319 tree arg1 = CALL_EXPR_ARG (exp, 1);
30320 rtx op0 = expand_normal (arg0);
30321 rtx op1 = expand_normal (arg1);
30322 rtx op2;
30323 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30324 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30325 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30326 enum rtx_code comparison = d->comparison;
30328 if (VECTOR_MODE_P (mode0))
30329 op0 = safe_vector_operand (op0, mode0);
30330 if (VECTOR_MODE_P (mode1))
30331 op1 = safe_vector_operand (op1, mode1);
30333 /* Swap operands if we have a comparison that isn't available in
30334 hardware. */
30335 if (swap)
30337 rtx tmp = gen_reg_rtx (mode1);
30338 emit_move_insn (tmp, op1);
30339 op1 = op0;
30340 op0 = tmp;
30343 if (optimize || !target
30344 || GET_MODE (target) != tmode
30345 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30346 target = gen_reg_rtx (tmode);
30348 if ((optimize && !register_operand (op0, mode0))
30349 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30350 op0 = copy_to_mode_reg (mode0, op0);
30351 if ((optimize && !register_operand (op1, mode1))
30352 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30353 op1 = copy_to_mode_reg (mode1, op1);
30355 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30356 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30357 if (! pat)
30358 return 0;
30359 emit_insn (pat);
30360 return target;
30363 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30365 static rtx
30366 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30367 rtx target)
30369 rtx pat;
30370 tree arg0 = CALL_EXPR_ARG (exp, 0);
30371 tree arg1 = CALL_EXPR_ARG (exp, 1);
30372 rtx op0 = expand_normal (arg0);
30373 rtx op1 = expand_normal (arg1);
30374 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30375 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30376 enum rtx_code comparison = d->comparison;
30378 if (VECTOR_MODE_P (mode0))
30379 op0 = safe_vector_operand (op0, mode0);
30380 if (VECTOR_MODE_P (mode1))
30381 op1 = safe_vector_operand (op1, mode1);
30383 /* Swap operands if we have a comparison that isn't available in
30384 hardware. */
30385 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30387 rtx tmp = op1;
30388 op1 = op0;
30389 op0 = tmp;
30392 target = gen_reg_rtx (SImode);
30393 emit_move_insn (target, const0_rtx);
30394 target = gen_rtx_SUBREG (QImode, target, 0);
30396 if ((optimize && !register_operand (op0, mode0))
30397 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30398 op0 = copy_to_mode_reg (mode0, op0);
30399 if ((optimize && !register_operand (op1, mode1))
30400 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30401 op1 = copy_to_mode_reg (mode1, op1);
30403 pat = GEN_FCN (d->icode) (op0, op1);
30404 if (! pat)
30405 return 0;
30406 emit_insn (pat);
30407 emit_insn (gen_rtx_SET (VOIDmode,
30408 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30409 gen_rtx_fmt_ee (comparison, QImode,
30410 SET_DEST (pat),
30411 const0_rtx)));
30413 return SUBREG_REG (target);
30416 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30418 static rtx
30419 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30420 rtx target)
30422 rtx pat;
30423 tree arg0 = CALL_EXPR_ARG (exp, 0);
30424 rtx op1, op0 = expand_normal (arg0);
30425 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30426 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30428 if (optimize || target == 0
30429 || GET_MODE (target) != tmode
30430 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30431 target = gen_reg_rtx (tmode);
30433 if (VECTOR_MODE_P (mode0))
30434 op0 = safe_vector_operand (op0, mode0);
30436 if ((optimize && !register_operand (op0, mode0))
30437 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30438 op0 = copy_to_mode_reg (mode0, op0);
30440 op1 = GEN_INT (d->comparison);
30442 pat = GEN_FCN (d->icode) (target, op0, op1);
30443 if (! pat)
30444 return 0;
30445 emit_insn (pat);
30446 return target;
30449 static rtx
30450 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30451 tree exp, rtx target)
30453 rtx pat;
30454 tree arg0 = CALL_EXPR_ARG (exp, 0);
30455 tree arg1 = CALL_EXPR_ARG (exp, 1);
30456 rtx op0 = expand_normal (arg0);
30457 rtx op1 = expand_normal (arg1);
30458 rtx op2;
30459 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30460 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30461 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30463 if (optimize || target == 0
30464 || GET_MODE (target) != tmode
30465 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30466 target = gen_reg_rtx (tmode);
30468 op0 = safe_vector_operand (op0, mode0);
30469 op1 = safe_vector_operand (op1, mode1);
30471 if ((optimize && !register_operand (op0, mode0))
30472 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30473 op0 = copy_to_mode_reg (mode0, op0);
30474 if ((optimize && !register_operand (op1, mode1))
30475 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30476 op1 = copy_to_mode_reg (mode1, op1);
30478 op2 = GEN_INT (d->comparison);
30480 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30481 if (! pat)
30482 return 0;
30483 emit_insn (pat);
30484 return target;
30487 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30489 static rtx
30490 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30491 rtx target)
30493 rtx pat;
30494 tree arg0 = CALL_EXPR_ARG (exp, 0);
30495 tree arg1 = CALL_EXPR_ARG (exp, 1);
30496 rtx op0 = expand_normal (arg0);
30497 rtx op1 = expand_normal (arg1);
30498 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30499 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30500 enum rtx_code comparison = d->comparison;
30502 if (VECTOR_MODE_P (mode0))
30503 op0 = safe_vector_operand (op0, mode0);
30504 if (VECTOR_MODE_P (mode1))
30505 op1 = safe_vector_operand (op1, mode1);
30507 target = gen_reg_rtx (SImode);
30508 emit_move_insn (target, const0_rtx);
30509 target = gen_rtx_SUBREG (QImode, target, 0);
30511 if ((optimize && !register_operand (op0, mode0))
30512 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30513 op0 = copy_to_mode_reg (mode0, op0);
30514 if ((optimize && !register_operand (op1, mode1))
30515 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30516 op1 = copy_to_mode_reg (mode1, op1);
30518 pat = GEN_FCN (d->icode) (op0, op1);
30519 if (! pat)
30520 return 0;
30521 emit_insn (pat);
30522 emit_insn (gen_rtx_SET (VOIDmode,
30523 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30524 gen_rtx_fmt_ee (comparison, QImode,
30525 SET_DEST (pat),
30526 const0_rtx)));
30528 return SUBREG_REG (target);
30531 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30533 static rtx
30534 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30535 tree exp, rtx target)
30537 rtx pat;
30538 tree arg0 = CALL_EXPR_ARG (exp, 0);
30539 tree arg1 = CALL_EXPR_ARG (exp, 1);
30540 tree arg2 = CALL_EXPR_ARG (exp, 2);
30541 tree arg3 = CALL_EXPR_ARG (exp, 3);
30542 tree arg4 = CALL_EXPR_ARG (exp, 4);
30543 rtx scratch0, scratch1;
30544 rtx op0 = expand_normal (arg0);
30545 rtx op1 = expand_normal (arg1);
30546 rtx op2 = expand_normal (arg2);
30547 rtx op3 = expand_normal (arg3);
30548 rtx op4 = expand_normal (arg4);
30549 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30551 tmode0 = insn_data[d->icode].operand[0].mode;
30552 tmode1 = insn_data[d->icode].operand[1].mode;
30553 modev2 = insn_data[d->icode].operand[2].mode;
30554 modei3 = insn_data[d->icode].operand[3].mode;
30555 modev4 = insn_data[d->icode].operand[4].mode;
30556 modei5 = insn_data[d->icode].operand[5].mode;
30557 modeimm = insn_data[d->icode].operand[6].mode;
30559 if (VECTOR_MODE_P (modev2))
30560 op0 = safe_vector_operand (op0, modev2);
30561 if (VECTOR_MODE_P (modev4))
30562 op2 = safe_vector_operand (op2, modev4);
30564 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30565 op0 = copy_to_mode_reg (modev2, op0);
30566 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30567 op1 = copy_to_mode_reg (modei3, op1);
30568 if ((optimize && !register_operand (op2, modev4))
30569 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30570 op2 = copy_to_mode_reg (modev4, op2);
30571 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30572 op3 = copy_to_mode_reg (modei5, op3);
30574 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30576 error ("the fifth argument must be an 8-bit immediate");
30577 return const0_rtx;
30580 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30582 if (optimize || !target
30583 || GET_MODE (target) != tmode0
30584 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30585 target = gen_reg_rtx (tmode0);
30587 scratch1 = gen_reg_rtx (tmode1);
30589 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30591 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30593 if (optimize || !target
30594 || GET_MODE (target) != tmode1
30595 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30596 target = gen_reg_rtx (tmode1);
30598 scratch0 = gen_reg_rtx (tmode0);
30600 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30602 else
30604 gcc_assert (d->flag);
30606 scratch0 = gen_reg_rtx (tmode0);
30607 scratch1 = gen_reg_rtx (tmode1);
30609 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30612 if (! pat)
30613 return 0;
30615 emit_insn (pat);
30617 if (d->flag)
30619 target = gen_reg_rtx (SImode);
30620 emit_move_insn (target, const0_rtx);
30621 target = gen_rtx_SUBREG (QImode, target, 0);
30623 emit_insn
30624 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30625 gen_rtx_fmt_ee (EQ, QImode,
30626 gen_rtx_REG ((enum machine_mode) d->flag,
30627 FLAGS_REG),
30628 const0_rtx)));
30629 return SUBREG_REG (target);
30631 else
30632 return target;
30636 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
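/* Same structure as ix86_expand_sse_pcmpestr above; pcmpistr{i,m} use
implicit (NUL-terminated) string lengths, so there are only the two vector
operands plus the immediate, with no explicit length operands. */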
30638 static rtx
30639 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30640 tree exp, rtx target)
30642 rtx pat;
30643 tree arg0 = CALL_EXPR_ARG (exp, 0);
30644 tree arg1 = CALL_EXPR_ARG (exp, 1);
30645 tree arg2 = CALL_EXPR_ARG (exp, 2);
30646 rtx scratch0, scratch1;
30647 rtx op0 = expand_normal (arg0);
30648 rtx op1 = expand_normal (arg1);
30649 rtx op2 = expand_normal (arg2);
30650 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30652 tmode0 = insn_data[d->icode].operand[0].mode;
30653 tmode1 = insn_data[d->icode].operand[1].mode;
30654 modev2 = insn_data[d->icode].operand[2].mode;
30655 modev3 = insn_data[d->icode].operand[3].mode;
30656 modeimm = insn_data[d->icode].operand[4].mode;
30658 if (VECTOR_MODE_P (modev2))
30659 op0 = safe_vector_operand (op0, modev2);
30660 if (VECTOR_MODE_P (modev3))
30661 op1 = safe_vector_operand (op1, modev3);
30663 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30664 op0 = copy_to_mode_reg (modev2, op0);
30665 if ((optimize && !register_operand (op1, modev3))
30666 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30667 op1 = copy_to_mode_reg (modev3, op1);
30669 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30671 error ("the third argument must be an 8-bit immediate");
30672 return const0_rtx;
30675 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30677 if (optimize || !target
30678 || GET_MODE (target) != tmode0
30679 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30680 target = gen_reg_rtx (tmode0);
30682 scratch1 = gen_reg_rtx (tmode1);
30684 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30686 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30688 if (optimize || !target
30689 || GET_MODE (target) != tmode1
30690 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30691 target = gen_reg_rtx (tmode1);
30693 scratch0 = gen_reg_rtx (tmode0);
30695 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30697 else
30699 gcc_assert (d->flag);
30701 scratch0 = gen_reg_rtx (tmode0);
30702 scratch1 = gen_reg_rtx (tmode1);
30704 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30707 if (! pat)
30708 return 0;
30710 emit_insn (pat);
30712 if (d->flag)
30714 target = gen_reg_rtx (SImode);
30715 emit_move_insn (target, const0_rtx);
30716 target = gen_rtx_SUBREG (QImode, target, 0);
30718 emit_insn
30719 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30720 gen_rtx_fmt_ee (EQ, QImode,
30721 gen_rtx_REG ((enum machine_mode) d->flag,
30722 FLAGS_REG),
30723 const0_rtx)));
30724 return SUBREG_REG (target);
30726 else
30727 return target;
30730 /* Subroutine of ix86_expand_builtin to take care of insns with a
30731 variable number of operands. */
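/* D->FLAG encodes an ix86_builtin_func_type; the switch below classifies
it into the number of arguments (NARGS), how many trailing arguments must
be immediates (NARGS_CONSTANT), whether the last argument is a shift count
(LAST_ARG_COUNT), whether the comparison operands must be swapped (SWAP),
and RMODE when the builtin's result mode differs from the insn's.  For
example, V8HI_FTYPE_V8HI_INT ends up with NARGS == 2 and
NARGS_CONSTANT == 1: one vector operand plus one immediate. */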
30733 static rtx
30734 ix86_expand_args_builtin (const struct builtin_description *d,
30735 tree exp, rtx target)
30737 rtx pat, real_target;
30738 unsigned int i, nargs;
30739 unsigned int nargs_constant = 0;
30740 int num_memory = 0;
30741 struct
30743 rtx op;
30744 enum machine_mode mode;
30745 } args[4];
30746 bool last_arg_count = false;
30747 enum insn_code icode = d->icode;
30748 const struct insn_data_d *insn_p = &insn_data[icode];
30749 enum machine_mode tmode = insn_p->operand[0].mode;
30750 enum machine_mode rmode = VOIDmode;
30751 bool swap = false;
30752 enum rtx_code comparison = d->comparison;
30754 switch ((enum ix86_builtin_func_type) d->flag)
30756 case V2DF_FTYPE_V2DF_ROUND:
30757 case V4DF_FTYPE_V4DF_ROUND:
30758 case V4SF_FTYPE_V4SF_ROUND:
30759 case V8SF_FTYPE_V8SF_ROUND:
30760 case V4SI_FTYPE_V4SF_ROUND:
30761 case V8SI_FTYPE_V8SF_ROUND:
30762 return ix86_expand_sse_round (d, exp, target);
30763 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30764 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30765 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30766 case INT_FTYPE_V8SF_V8SF_PTEST:
30767 case INT_FTYPE_V4DI_V4DI_PTEST:
30768 case INT_FTYPE_V4DF_V4DF_PTEST:
30769 case INT_FTYPE_V4SF_V4SF_PTEST:
30770 case INT_FTYPE_V2DI_V2DI_PTEST:
30771 case INT_FTYPE_V2DF_V2DF_PTEST:
30772 return ix86_expand_sse_ptest (d, exp, target);
30773 case FLOAT128_FTYPE_FLOAT128:
30774 case FLOAT_FTYPE_FLOAT:
30775 case INT_FTYPE_INT:
30776 case UINT64_FTYPE_INT:
30777 case UINT16_FTYPE_UINT16:
30778 case INT64_FTYPE_INT64:
30779 case INT64_FTYPE_V4SF:
30780 case INT64_FTYPE_V2DF:
30781 case INT_FTYPE_V16QI:
30782 case INT_FTYPE_V8QI:
30783 case INT_FTYPE_V8SF:
30784 case INT_FTYPE_V4DF:
30785 case INT_FTYPE_V4SF:
30786 case INT_FTYPE_V2DF:
30787 case INT_FTYPE_V32QI:
30788 case V16QI_FTYPE_V16QI:
30789 case V8SI_FTYPE_V8SF:
30790 case V8SI_FTYPE_V4SI:
30791 case V8HI_FTYPE_V8HI:
30792 case V8HI_FTYPE_V16QI:
30793 case V8QI_FTYPE_V8QI:
30794 case V8SF_FTYPE_V8SF:
30795 case V8SF_FTYPE_V8SI:
30796 case V8SF_FTYPE_V4SF:
30797 case V8SF_FTYPE_V8HI:
30798 case V4SI_FTYPE_V4SI:
30799 case V4SI_FTYPE_V16QI:
30800 case V4SI_FTYPE_V4SF:
30801 case V4SI_FTYPE_V8SI:
30802 case V4SI_FTYPE_V8HI:
30803 case V4SI_FTYPE_V4DF:
30804 case V4SI_FTYPE_V2DF:
30805 case V4HI_FTYPE_V4HI:
30806 case V4DF_FTYPE_V4DF:
30807 case V4DF_FTYPE_V4SI:
30808 case V4DF_FTYPE_V4SF:
30809 case V4DF_FTYPE_V2DF:
30810 case V4SF_FTYPE_V4SF:
30811 case V4SF_FTYPE_V4SI:
30812 case V4SF_FTYPE_V8SF:
30813 case V4SF_FTYPE_V4DF:
30814 case V4SF_FTYPE_V8HI:
30815 case V4SF_FTYPE_V2DF:
30816 case V2DI_FTYPE_V2DI:
30817 case V2DI_FTYPE_V16QI:
30818 case V2DI_FTYPE_V8HI:
30819 case V2DI_FTYPE_V4SI:
30820 case V2DF_FTYPE_V2DF:
30821 case V2DF_FTYPE_V4SI:
30822 case V2DF_FTYPE_V4DF:
30823 case V2DF_FTYPE_V4SF:
30824 case V2DF_FTYPE_V2SI:
30825 case V2SI_FTYPE_V2SI:
30826 case V2SI_FTYPE_V4SF:
30827 case V2SI_FTYPE_V2SF:
30828 case V2SI_FTYPE_V2DF:
30829 case V2SF_FTYPE_V2SF:
30830 case V2SF_FTYPE_V2SI:
30831 case V32QI_FTYPE_V32QI:
30832 case V32QI_FTYPE_V16QI:
30833 case V16HI_FTYPE_V16HI:
30834 case V16HI_FTYPE_V8HI:
30835 case V8SI_FTYPE_V8SI:
30836 case V16HI_FTYPE_V16QI:
30837 case V8SI_FTYPE_V16QI:
30838 case V4DI_FTYPE_V16QI:
30839 case V8SI_FTYPE_V8HI:
30840 case V4DI_FTYPE_V8HI:
30841 case V4DI_FTYPE_V4SI:
30842 case V4DI_FTYPE_V2DI:
30843 nargs = 1;
30844 break;
30845 case V4SF_FTYPE_V4SF_VEC_MERGE:
30846 case V2DF_FTYPE_V2DF_VEC_MERGE:
30847 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30848 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30849 case V16QI_FTYPE_V16QI_V16QI:
30850 case V16QI_FTYPE_V8HI_V8HI:
30851 case V8QI_FTYPE_V8QI_V8QI:
30852 case V8QI_FTYPE_V4HI_V4HI:
30853 case V8HI_FTYPE_V8HI_V8HI:
30854 case V8HI_FTYPE_V16QI_V16QI:
30855 case V8HI_FTYPE_V4SI_V4SI:
30856 case V8SF_FTYPE_V8SF_V8SF:
30857 case V8SF_FTYPE_V8SF_V8SI:
30858 case V4SI_FTYPE_V4SI_V4SI:
30859 case V4SI_FTYPE_V8HI_V8HI:
30860 case V4SI_FTYPE_V4SF_V4SF:
30861 case V4SI_FTYPE_V2DF_V2DF:
30862 case V4HI_FTYPE_V4HI_V4HI:
30863 case V4HI_FTYPE_V8QI_V8QI:
30864 case V4HI_FTYPE_V2SI_V2SI:
30865 case V4DF_FTYPE_V4DF_V4DF:
30866 case V4DF_FTYPE_V4DF_V4DI:
30867 case V4SF_FTYPE_V4SF_V4SF:
30868 case V4SF_FTYPE_V4SF_V4SI:
30869 case V4SF_FTYPE_V4SF_V2SI:
30870 case V4SF_FTYPE_V4SF_V2DF:
30871 case V4SF_FTYPE_V4SF_DI:
30872 case V4SF_FTYPE_V4SF_SI:
30873 case V2DI_FTYPE_V2DI_V2DI:
30874 case V2DI_FTYPE_V16QI_V16QI:
30875 case V2DI_FTYPE_V4SI_V4SI:
30876 case V2UDI_FTYPE_V4USI_V4USI:
30877 case V2DI_FTYPE_V2DI_V16QI:
30878 case V2DI_FTYPE_V2DF_V2DF:
30879 case V2SI_FTYPE_V2SI_V2SI:
30880 case V2SI_FTYPE_V4HI_V4HI:
30881 case V2SI_FTYPE_V2SF_V2SF:
30882 case V2DF_FTYPE_V2DF_V2DF:
30883 case V2DF_FTYPE_V2DF_V4SF:
30884 case V2DF_FTYPE_V2DF_V2DI:
30885 case V2DF_FTYPE_V2DF_DI:
30886 case V2DF_FTYPE_V2DF_SI:
30887 case V2SF_FTYPE_V2SF_V2SF:
30888 case V1DI_FTYPE_V1DI_V1DI:
30889 case V1DI_FTYPE_V8QI_V8QI:
30890 case V1DI_FTYPE_V2SI_V2SI:
30891 case V32QI_FTYPE_V16HI_V16HI:
30892 case V16HI_FTYPE_V8SI_V8SI:
30893 case V32QI_FTYPE_V32QI_V32QI:
30894 case V16HI_FTYPE_V32QI_V32QI:
30895 case V16HI_FTYPE_V16HI_V16HI:
30896 case V8SI_FTYPE_V4DF_V4DF:
30897 case V8SI_FTYPE_V8SI_V8SI:
30898 case V8SI_FTYPE_V16HI_V16HI:
30899 case V4DI_FTYPE_V4DI_V4DI:
30900 case V4DI_FTYPE_V8SI_V8SI:
30901 case V4UDI_FTYPE_V8USI_V8USI:
30902 if (comparison == UNKNOWN)
30903 return ix86_expand_binop_builtin (icode, exp, target);
30904 nargs = 2;
30905 break;
30906 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30907 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30908 gcc_assert (comparison != UNKNOWN);
30909 nargs = 2;
30910 swap = true;
30911 break;
30912 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30913 case V16HI_FTYPE_V16HI_SI_COUNT:
30914 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30915 case V8SI_FTYPE_V8SI_SI_COUNT:
30916 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30917 case V4DI_FTYPE_V4DI_INT_COUNT:
30918 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30919 case V8HI_FTYPE_V8HI_SI_COUNT:
30920 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30921 case V4SI_FTYPE_V4SI_SI_COUNT:
30922 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30923 case V4HI_FTYPE_V4HI_SI_COUNT:
30924 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30925 case V2DI_FTYPE_V2DI_SI_COUNT:
30926 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30927 case V2SI_FTYPE_V2SI_SI_COUNT:
30928 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30929 case V1DI_FTYPE_V1DI_SI_COUNT:
30930 nargs = 2;
30931 last_arg_count = true;
30932 break;
30933 case UINT64_FTYPE_UINT64_UINT64:
30934 case UINT_FTYPE_UINT_UINT:
30935 case UINT_FTYPE_UINT_USHORT:
30936 case UINT_FTYPE_UINT_UCHAR:
30937 case UINT16_FTYPE_UINT16_INT:
30938 case UINT8_FTYPE_UINT8_INT:
30939 nargs = 2;
30940 break;
30941 case V2DI_FTYPE_V2DI_INT_CONVERT:
30942 nargs = 2;
30943 rmode = V1TImode;
30944 nargs_constant = 1;
30945 break;
30946 case V4DI_FTYPE_V4DI_INT_CONVERT:
30947 nargs = 2;
30948 rmode = V2TImode;
30949 nargs_constant = 1;
30950 break;
30951 case V8HI_FTYPE_V8HI_INT:
30952 case V8HI_FTYPE_V8SF_INT:
30953 case V8HI_FTYPE_V4SF_INT:
30954 case V8SF_FTYPE_V8SF_INT:
30955 case V4SI_FTYPE_V4SI_INT:
30956 case V4SI_FTYPE_V8SI_INT:
30957 case V4HI_FTYPE_V4HI_INT:
30958 case V4DF_FTYPE_V4DF_INT:
30959 case V4SF_FTYPE_V4SF_INT:
30960 case V4SF_FTYPE_V8SF_INT:
30961 case V2DI_FTYPE_V2DI_INT:
30962 case V2DF_FTYPE_V2DF_INT:
30963 case V2DF_FTYPE_V4DF_INT:
30964 case V16HI_FTYPE_V16HI_INT:
30965 case V8SI_FTYPE_V8SI_INT:
30966 case V4DI_FTYPE_V4DI_INT:
30967 case V2DI_FTYPE_V4DI_INT:
30968 nargs = 2;
30969 nargs_constant = 1;
30970 break;
30971 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30972 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30973 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30974 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30975 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30976 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30977 nargs = 3;
30978 break;
30979 case V32QI_FTYPE_V32QI_V32QI_INT:
30980 case V16HI_FTYPE_V16HI_V16HI_INT:
30981 case V16QI_FTYPE_V16QI_V16QI_INT:
30982 case V4DI_FTYPE_V4DI_V4DI_INT:
30983 case V8HI_FTYPE_V8HI_V8HI_INT:
30984 case V8SI_FTYPE_V8SI_V8SI_INT:
30985 case V8SI_FTYPE_V8SI_V4SI_INT:
30986 case V8SF_FTYPE_V8SF_V8SF_INT:
30987 case V8SF_FTYPE_V8SF_V4SF_INT:
30988 case V4SI_FTYPE_V4SI_V4SI_INT:
30989 case V4DF_FTYPE_V4DF_V4DF_INT:
30990 case V4DF_FTYPE_V4DF_V2DF_INT:
30991 case V4SF_FTYPE_V4SF_V4SF_INT:
30992 case V2DI_FTYPE_V2DI_V2DI_INT:
30993 case V4DI_FTYPE_V4DI_V2DI_INT:
30994 case V2DF_FTYPE_V2DF_V2DF_INT:
30995 nargs = 3;
30996 nargs_constant = 1;
30997 break;
30998 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
30999 nargs = 3;
31000 rmode = V4DImode;
31001 nargs_constant = 1;
31002 break;
31003 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31004 nargs = 3;
31005 rmode = V2DImode;
31006 nargs_constant = 1;
31007 break;
31008 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31009 nargs = 3;
31010 rmode = DImode;
31011 nargs_constant = 1;
31012 break;
31013 case V2DI_FTYPE_V2DI_UINT_UINT:
31014 nargs = 3;
31015 nargs_constant = 2;
31016 break;
31017 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31018 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31019 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31020 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31021 nargs = 4;
31022 nargs_constant = 1;
31023 break;
31024 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31025 nargs = 4;
31026 nargs_constant = 2;
31027 break;
31028 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31029 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31030 nargs = 4;
31031 break;
31032 default:
31033 gcc_unreachable ();
31036 gcc_assert (nargs <= ARRAY_SIZE (args));
31038 if (comparison != UNKNOWN)
31040 gcc_assert (nargs == 2);
31041 return ix86_expand_sse_compare (d, exp, target, swap);
31044 if (rmode == VOIDmode || rmode == tmode)
31046 if (optimize
31047 || target == 0
31048 || GET_MODE (target) != tmode
31049 || !insn_p->operand[0].predicate (target, tmode))
31050 target = gen_reg_rtx (tmode);
31051 real_target = target;
31053 else
31055 target = gen_reg_rtx (rmode);
31056 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31059 for (i = 0; i < nargs; i++)
31061 tree arg = CALL_EXPR_ARG (exp, i);
31062 rtx op = expand_normal (arg);
31063 enum machine_mode mode = insn_p->operand[i + 1].mode;
31064 bool match = insn_p->operand[i + 1].predicate (op, mode);
31066 if (last_arg_count && (i + 1) == nargs)
31068 /* SIMD shift insns take either an 8-bit immediate or a
31069 register as the count. But the builtin functions take an int as
31070 the count. If the count doesn't match, we put it in a register. */
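/* For example, a builtin classified as V4SI_FTYPE_V4SI_SI_COUNT receives
its count as a plain int; when the caller passes a variable rather than a
literal, the subreg/copy below moves it into a register the shift pattern
will accept. */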
31071 if (!match)
31073 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31074 if (!insn_p->operand[i + 1].predicate (op, mode))
31075 op = copy_to_reg (op);
31078 else if ((nargs - i) <= nargs_constant)
31080 if (!match)
31081 switch (icode)
31083 case CODE_FOR_avx2_inserti128:
31084 case CODE_FOR_avx2_extracti128:
31085 error ("the last argument must be an 1-bit immediate");
31086 return const0_rtx;
31088 case CODE_FOR_sse4_1_roundsd:
31089 case CODE_FOR_sse4_1_roundss:
31091 case CODE_FOR_sse4_1_roundpd:
31092 case CODE_FOR_sse4_1_roundps:
31093 case CODE_FOR_avx_roundpd256:
31094 case CODE_FOR_avx_roundps256:
31096 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31097 case CODE_FOR_sse4_1_roundps_sfix:
31098 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31099 case CODE_FOR_avx_roundps_sfix256:
31101 case CODE_FOR_sse4_1_blendps:
31102 case CODE_FOR_avx_blendpd256:
31103 case CODE_FOR_avx_vpermilv4df:
31104 error ("the last argument must be a 4-bit immediate");
31105 return const0_rtx;
31107 case CODE_FOR_sse4_1_blendpd:
31108 case CODE_FOR_avx_vpermilv2df:
31109 case CODE_FOR_xop_vpermil2v2df3:
31110 case CODE_FOR_xop_vpermil2v4sf3:
31111 case CODE_FOR_xop_vpermil2v4df3:
31112 case CODE_FOR_xop_vpermil2v8sf3:
31113 error ("the last argument must be a 2-bit immediate");
31114 return const0_rtx;
31116 case CODE_FOR_avx_vextractf128v4df:
31117 case CODE_FOR_avx_vextractf128v8sf:
31118 case CODE_FOR_avx_vextractf128v8si:
31119 case CODE_FOR_avx_vinsertf128v4df:
31120 case CODE_FOR_avx_vinsertf128v8sf:
31121 case CODE_FOR_avx_vinsertf128v8si:
31122 error ("the last argument must be a 1-bit immediate");
31123 return const0_rtx;
31125 case CODE_FOR_avx_vmcmpv2df3:
31126 case CODE_FOR_avx_vmcmpv4sf3:
31127 case CODE_FOR_avx_cmpv2df3:
31128 case CODE_FOR_avx_cmpv4sf3:
31129 case CODE_FOR_avx_cmpv4df3:
31130 case CODE_FOR_avx_cmpv8sf3:
31131 error ("the last argument must be a 5-bit immediate");
31132 return const0_rtx;
31134 default:
31135 switch (nargs_constant)
31137 case 2:
31138 if ((nargs - i) == nargs_constant)
31140 error ("the next to last argument must be an 8-bit immediate");
31141 break;
31143 case 1:
31144 error ("the last argument must be an 8-bit immediate");
31145 break;
31146 default:
31147 gcc_unreachable ();
31149 return const0_rtx;
31152 else
31154 if (VECTOR_MODE_P (mode))
31155 op = safe_vector_operand (op, mode);
31157 /* If we aren't optimizing, only allow one memory operand to
31158 be generated. */
31159 if (memory_operand (op, mode))
31160 num_memory++;
31162 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31164 if (optimize || !match || num_memory > 1)
31165 op = copy_to_mode_reg (mode, op);
31167 else
31169 op = copy_to_reg (op);
31170 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31174 args[i].op = op;
31175 args[i].mode = mode;
31178 switch (nargs)
31180 case 1:
31181 pat = GEN_FCN (icode) (real_target, args[0].op);
31182 break;
31183 case 2:
31184 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31185 break;
31186 case 3:
31187 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31188 args[2].op);
31189 break;
31190 case 4:
31191 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31192 args[2].op, args[3].op);
31193 break;
31194 default:
31195 gcc_unreachable ();
31198 if (! pat)
31199 return 0;
31201 emit_insn (pat);
31202 return target;
31205 /* Subroutine of ix86_expand_builtin to take care of special insns
31206 with a variable number of operands. */
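/* For store-class types the first call argument names the destination:
it is wrapped in a MEM when MEMORY is set (or forced into a register
otherwise), and the remaining call arguments are shifted by ARG_ADJUST.
For load-class types MEMORY gives the index of the operand, if any, that
must be turned into a MEM. */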
31208 static rtx
31209 ix86_expand_special_args_builtin (const struct builtin_description *d,
31210 tree exp, rtx target)
31212 tree arg;
31213 rtx pat, op;
31214 unsigned int i, nargs, arg_adjust, memory;
31215 struct
31217 rtx op;
31218 enum machine_mode mode;
31219 } args[3];
31220 enum insn_code icode = d->icode;
31221 bool last_arg_constant = false;
31222 const struct insn_data_d *insn_p = &insn_data[icode];
31223 enum machine_mode tmode = insn_p->operand[0].mode;
31224 enum { load, store } klass;
31226 switch ((enum ix86_builtin_func_type) d->flag)
31228 case VOID_FTYPE_VOID:
31229 emit_insn (GEN_FCN (icode) (target));
31230 return 0;
31231 case VOID_FTYPE_UINT64:
31232 case VOID_FTYPE_UNSIGNED:
31233 nargs = 0;
31234 klass = store;
31235 memory = 0;
31236 break;
31238 case INT_FTYPE_VOID:
31239 case UINT64_FTYPE_VOID:
31240 case UNSIGNED_FTYPE_VOID:
31241 nargs = 0;
31242 klass = load;
31243 memory = 0;
31244 break;
31245 case UINT64_FTYPE_PUNSIGNED:
31246 case V2DI_FTYPE_PV2DI:
31247 case V4DI_FTYPE_PV4DI:
31248 case V32QI_FTYPE_PCCHAR:
31249 case V16QI_FTYPE_PCCHAR:
31250 case V8SF_FTYPE_PCV4SF:
31251 case V8SF_FTYPE_PCFLOAT:
31252 case V4SF_FTYPE_PCFLOAT:
31253 case V4DF_FTYPE_PCV2DF:
31254 case V4DF_FTYPE_PCDOUBLE:
31255 case V2DF_FTYPE_PCDOUBLE:
31256 case VOID_FTYPE_PVOID:
31257 nargs = 1;
31258 klass = load;
31259 memory = 0;
31260 break;
31261 case VOID_FTYPE_PV2SF_V4SF:
31262 case VOID_FTYPE_PV4DI_V4DI:
31263 case VOID_FTYPE_PV2DI_V2DI:
31264 case VOID_FTYPE_PCHAR_V32QI:
31265 case VOID_FTYPE_PCHAR_V16QI:
31266 case VOID_FTYPE_PFLOAT_V8SF:
31267 case VOID_FTYPE_PFLOAT_V4SF:
31268 case VOID_FTYPE_PDOUBLE_V4DF:
31269 case VOID_FTYPE_PDOUBLE_V2DF:
31270 case VOID_FTYPE_PLONGLONG_LONGLONG:
31271 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31272 case VOID_FTYPE_PINT_INT:
31273 nargs = 1;
31274 klass = store;
31275 /* Reserve memory operand for target. */
31276 memory = ARRAY_SIZE (args);
31277 break;
31278 case V4SF_FTYPE_V4SF_PCV2SF:
31279 case V2DF_FTYPE_V2DF_PCDOUBLE:
31280 nargs = 2;
31281 klass = load;
31282 memory = 1;
31283 break;
31284 case V8SF_FTYPE_PCV8SF_V8SI:
31285 case V4DF_FTYPE_PCV4DF_V4DI:
31286 case V4SF_FTYPE_PCV4SF_V4SI:
31287 case V2DF_FTYPE_PCV2DF_V2DI:
31288 case V8SI_FTYPE_PCV8SI_V8SI:
31289 case V4DI_FTYPE_PCV4DI_V4DI:
31290 case V4SI_FTYPE_PCV4SI_V4SI:
31291 case V2DI_FTYPE_PCV2DI_V2DI:
31292 nargs = 2;
31293 klass = load;
31294 memory = 0;
31295 break;
31296 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31297 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31298 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31299 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31300 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31301 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31302 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31303 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31304 nargs = 2;
31305 klass = store;
31306 /* Reserve memory operand for target. */
31307 memory = ARRAY_SIZE (args);
31308 break;
31309 case VOID_FTYPE_UINT_UINT_UINT:
31310 case VOID_FTYPE_UINT64_UINT_UINT:
31311 case UCHAR_FTYPE_UINT_UINT_UINT:
31312 case UCHAR_FTYPE_UINT64_UINT_UINT:
31313 nargs = 3;
31314 klass = load;
31315 memory = ARRAY_SIZE (args);
31316 last_arg_constant = true;
31317 break;
31318 default:
31319 gcc_unreachable ();
31322 gcc_assert (nargs <= ARRAY_SIZE (args));
31324 if (klass == store)
31326 arg = CALL_EXPR_ARG (exp, 0);
31327 op = expand_normal (arg);
31328 gcc_assert (target == 0);
31329 if (memory)
31331 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31332 target = gen_rtx_MEM (tmode, op);
31334 else
31335 target = force_reg (tmode, op);
31336 arg_adjust = 1;
31338 else
31340 arg_adjust = 0;
31341 if (optimize
31342 || target == 0
31343 || !register_operand (target, tmode)
31344 || GET_MODE (target) != tmode)
31345 target = gen_reg_rtx (tmode);
31348 for (i = 0; i < nargs; i++)
31350 enum machine_mode mode = insn_p->operand[i + 1].mode;
31351 bool match;
31353 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31354 op = expand_normal (arg);
31355 match = insn_p->operand[i + 1].predicate (op, mode);
31357 if (last_arg_constant && (i + 1) == nargs)
31359 if (!match)
31361 if (icode == CODE_FOR_lwp_lwpvalsi3
31362 || icode == CODE_FOR_lwp_lwpinssi3
31363 || icode == CODE_FOR_lwp_lwpvaldi3
31364 || icode == CODE_FOR_lwp_lwpinsdi3)
31365 error ("the last argument must be a 32-bit immediate");
31366 else
31367 error ("the last argument must be an 8-bit immediate");
31368 return const0_rtx;
31371 else
31373 if (i == memory)
31375 /* This must be the memory operand. */
31376 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31377 op = gen_rtx_MEM (mode, op);
31378 gcc_assert (GET_MODE (op) == mode
31379 || GET_MODE (op) == VOIDmode);
31381 else
31383 /* This must be a register. */
31384 if (VECTOR_MODE_P (mode))
31385 op = safe_vector_operand (op, mode);
31387 gcc_assert (GET_MODE (op) == mode
31388 || GET_MODE (op) == VOIDmode);
31389 op = copy_to_mode_reg (mode, op);
31393 args[i].op = op;
31394 args[i].mode = mode;
31397 switch (nargs)
31399 case 0:
31400 pat = GEN_FCN (icode) (target);
31401 break;
31402 case 1:
31403 pat = GEN_FCN (icode) (target, args[0].op);
31404 break;
31405 case 2:
31406 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31407 break;
31408 case 3:
31409 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31410 break;
31411 default:
31412 gcc_unreachable ();
31415 if (! pat)
31416 return 0;
31417 emit_insn (pat);
31418 return klass == store ? 0 : target;
31421 /* Return the integer constant in ARG. Constrain it to be in the range
31422 of the subparts of VEC_TYPE; issue an error if not. */
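/* For a four-element vector type such as V4SF, MAX is 3, so any selector
outside 0..3 is diagnosed below and 0 is returned instead. */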
31424 static int
31425 get_element_number (tree vec_type, tree arg)
31427 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31429 if (!host_integerp (arg, 1)
31430 || (elt = tree_low_cst (arg, 1), elt > max))
31432 error ("selector must be an integer constant in the range 0..%wi", max);
31433 return 0;
31436 return elt;
31439 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31440 ix86_expand_vector_init. We DO have language-level syntax for this, in
31441 the form of (type){ init-list }. Except that since we can't place emms
31442 instructions from inside the compiler, we can't allow the use of MMX
31443 registers unless the user explicitly asks for it. So we do *not* define
31444 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31445 we have builtins invoked by mmintrin.h that give us license to emit
31446 these sorts of instructions. */
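/* For illustration (the V2SI case): __builtin_ia32_vec_init_v2si (a, b)
reaches this function with two call arguments, which are collected into a
PARALLEL and handed to ix86_expand_vector_init, with the result left in a
register of the vector mode. */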
31448 static rtx
31449 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31451 enum machine_mode tmode = TYPE_MODE (type);
31452 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31453 int i, n_elt = GET_MODE_NUNITS (tmode);
31454 rtvec v = rtvec_alloc (n_elt);
31456 gcc_assert (VECTOR_MODE_P (tmode));
31457 gcc_assert (call_expr_nargs (exp) == n_elt);
31459 for (i = 0; i < n_elt; ++i)
31461 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31462 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31465 if (!target || !register_operand (target, tmode))
31466 target = gen_reg_rtx (tmode);
31468 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31469 return target;
31472 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31473 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31474 had a language-level syntax for referencing vector elements. */
31476 static rtx
31477 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31479 enum machine_mode tmode, mode0;
31480 tree arg0, arg1;
31481 int elt;
31482 rtx op0;
31484 arg0 = CALL_EXPR_ARG (exp, 0);
31485 arg1 = CALL_EXPR_ARG (exp, 1);
31487 op0 = expand_normal (arg0);
31488 elt = get_element_number (TREE_TYPE (arg0), arg1);
31490 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31491 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31492 gcc_assert (VECTOR_MODE_P (mode0));
31494 op0 = force_reg (mode0, op0);
31496 if (optimize || !target || !register_operand (target, tmode))
31497 target = gen_reg_rtx (tmode);
31499 ix86_expand_vector_extract (true, target, op0, elt);
31501 return target;
31504 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31505 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31506 a language-level syntax for referencing vector elements. */
31508 static rtx
31509 ix86_expand_vec_set_builtin (tree exp)
31511 enum machine_mode tmode, mode1;
31512 tree arg0, arg1, arg2;
31513 int elt;
31514 rtx op0, op1, target;
31516 arg0 = CALL_EXPR_ARG (exp, 0);
31517 arg1 = CALL_EXPR_ARG (exp, 1);
31518 arg2 = CALL_EXPR_ARG (exp, 2);
31520 tmode = TYPE_MODE (TREE_TYPE (arg0));
31521 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31522 gcc_assert (VECTOR_MODE_P (tmode));
31524 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31525 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31526 elt = get_element_number (TREE_TYPE (arg0), arg2);
31528 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31529 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31531 op0 = force_reg (tmode, op0);
31532 op1 = force_reg (mode1, op1);
31534 /* OP0 is the source of these builtin functions and shouldn't be
31535 modified. Create a copy, use it and return it as target. */
31536 target = gen_reg_rtx (tmode);
31537 emit_move_insn (target, op0);
31538 ix86_expand_vector_set (true, target, op1, elt);
31540 return target;
31543 /* Expand an expression EXP that calls a built-in function,
31544 with result going to TARGET if that's convenient
31545 (and in mode MODE if that's convenient).
31546 SUBTARGET may be used as the target for computing one of EXP's operands.
31547 IGNORE is nonzero if the value is to be ignored. */
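/* Builtins that need hand-written expansion are handled in the big switch
below; everything else falls through to the descriptor-table scans at the
end of the function (bdesc_special_args, bdesc_args, bdesc_comi,
bdesc_pcmpestr, bdesc_pcmpistr, bdesc_multi_arg). */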
31549 static rtx
31550 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31551 enum machine_mode mode ATTRIBUTE_UNUSED,
31552 int ignore ATTRIBUTE_UNUSED)
31554 const struct builtin_description *d;
31555 size_t i;
31556 enum insn_code icode;
31557 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31558 tree arg0, arg1, arg2, arg3, arg4;
31559 rtx op0, op1, op2, op3, op4, pat, insn;
31560 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31561 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31563 /* For CPU builtins that can be folded, fold first and expand the fold. */
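/* Roughly, at the source level these are used as, e.g.:

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       ...

   CPU_IS and CPU_SUPPORTS are folded by fold_builtin_cpu into ordinary
   tree expressions and expanded as such; CPU_INIT becomes a call to
   __cpu_indicator_init in libgcc. */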
31564 switch (fcode)
31566 case IX86_BUILTIN_CPU_INIT:
31568 /* Make it call __cpu_indicator_init in libgcc. */
31569 tree call_expr, fndecl, type;
31570 type = build_function_type_list (integer_type_node, NULL_TREE);
31571 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31572 call_expr = build_call_expr (fndecl, 0);
31573 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31575 case IX86_BUILTIN_CPU_IS:
31576 case IX86_BUILTIN_CPU_SUPPORTS:
31578 tree arg0 = CALL_EXPR_ARG (exp, 0);
31579 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31580 gcc_assert (fold_expr != NULL_TREE);
31581 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31585 /* Determine whether the builtin function is available under the current ISA.
31586 Originally the builtin was not created if it wasn't applicable to the
31587 current ISA based on the command line switches. With function specific
31588 options, we need to check in the context of the function making the call
31589 whether it is supported. */
31590 if (ix86_builtins_isa[fcode].isa
31591 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31593 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31594 NULL, (enum fpmath_unit) 0, false);
31596 if (!opts)
31597 error ("%qE needs unknown isa option", fndecl);
31598 else
31600 gcc_assert (opts != NULL);
31601 error ("%qE needs isa option %s", fndecl, opts);
31602 free (opts);
31604 return const0_rtx;
31607 switch (fcode)
31609 case IX86_BUILTIN_MASKMOVQ:
31610 case IX86_BUILTIN_MASKMOVDQU:
31611 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31612 ? CODE_FOR_mmx_maskmovq
31613 : CODE_FOR_sse2_maskmovdqu);
31614 /* Note the arg order is different from the operand order. */
31615 arg1 = CALL_EXPR_ARG (exp, 0);
31616 arg2 = CALL_EXPR_ARG (exp, 1);
31617 arg0 = CALL_EXPR_ARG (exp, 2);
31618 op0 = expand_normal (arg0);
31619 op1 = expand_normal (arg1);
31620 op2 = expand_normal (arg2);
31621 mode0 = insn_data[icode].operand[0].mode;
31622 mode1 = insn_data[icode].operand[1].mode;
31623 mode2 = insn_data[icode].operand[2].mode;
31625 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31626 op0 = gen_rtx_MEM (mode1, op0);
31628 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31629 op0 = copy_to_mode_reg (mode0, op0);
31630 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31631 op1 = copy_to_mode_reg (mode1, op1);
31632 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31633 op2 = copy_to_mode_reg (mode2, op2);
31634 pat = GEN_FCN (icode) (op0, op1, op2);
31635 if (! pat)
31636 return 0;
31637 emit_insn (pat);
31638 return 0;
31640 case IX86_BUILTIN_LDMXCSR:
31641 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31642 target = assign_386_stack_local (SImode, SLOT_TEMP);
31643 emit_move_insn (target, op0);
31644 emit_insn (gen_sse_ldmxcsr (target));
31645 return 0;
31647 case IX86_BUILTIN_STMXCSR:
31648 target = assign_386_stack_local (SImode, SLOT_TEMP);
31649 emit_insn (gen_sse_stmxcsr (target));
31650 return copy_to_mode_reg (SImode, target);
31652 case IX86_BUILTIN_CLFLUSH:
31653 arg0 = CALL_EXPR_ARG (exp, 0);
31654 op0 = expand_normal (arg0);
31655 icode = CODE_FOR_sse2_clflush;
31656 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31657 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31659 emit_insn (gen_sse2_clflush (op0));
31660 return 0;
31662 case IX86_BUILTIN_MONITOR:
31663 arg0 = CALL_EXPR_ARG (exp, 0);
31664 arg1 = CALL_EXPR_ARG (exp, 1);
31665 arg2 = CALL_EXPR_ARG (exp, 2);
31666 op0 = expand_normal (arg0);
31667 op1 = expand_normal (arg1);
31668 op2 = expand_normal (arg2);
31669 if (!REG_P (op0))
31670 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31671 if (!REG_P (op1))
31672 op1 = copy_to_mode_reg (SImode, op1);
31673 if (!REG_P (op2))
31674 op2 = copy_to_mode_reg (SImode, op2);
31675 emit_insn (ix86_gen_monitor (op0, op1, op2));
31676 return 0;
31678 case IX86_BUILTIN_MWAIT:
31679 arg0 = CALL_EXPR_ARG (exp, 0);
31680 arg1 = CALL_EXPR_ARG (exp, 1);
31681 op0 = expand_normal (arg0);
31682 op1 = expand_normal (arg1);
31683 if (!REG_P (op0))
31684 op0 = copy_to_mode_reg (SImode, op0);
31685 if (!REG_P (op1))
31686 op1 = copy_to_mode_reg (SImode, op1);
31687 emit_insn (gen_sse3_mwait (op0, op1));
31688 return 0;
31690 case IX86_BUILTIN_VEC_INIT_V2SI:
31691 case IX86_BUILTIN_VEC_INIT_V4HI:
31692 case IX86_BUILTIN_VEC_INIT_V8QI:
31693 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31695 case IX86_BUILTIN_VEC_EXT_V2DF:
31696 case IX86_BUILTIN_VEC_EXT_V2DI:
31697 case IX86_BUILTIN_VEC_EXT_V4SF:
31698 case IX86_BUILTIN_VEC_EXT_V4SI:
31699 case IX86_BUILTIN_VEC_EXT_V8HI:
31700 case IX86_BUILTIN_VEC_EXT_V2SI:
31701 case IX86_BUILTIN_VEC_EXT_V4HI:
31702 case IX86_BUILTIN_VEC_EXT_V16QI:
31703 return ix86_expand_vec_ext_builtin (exp, target);
31705 case IX86_BUILTIN_VEC_SET_V2DI:
31706 case IX86_BUILTIN_VEC_SET_V4SF:
31707 case IX86_BUILTIN_VEC_SET_V4SI:
31708 case IX86_BUILTIN_VEC_SET_V8HI:
31709 case IX86_BUILTIN_VEC_SET_V4HI:
31710 case IX86_BUILTIN_VEC_SET_V16QI:
31711 return ix86_expand_vec_set_builtin (exp);
31713 case IX86_BUILTIN_INFQ:
31714 case IX86_BUILTIN_HUGE_VALQ:
31716 REAL_VALUE_TYPE inf;
31717 rtx tmp;
31719 real_inf (&inf);
31720 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31722 tmp = validize_mem (force_const_mem (mode, tmp));
31724 if (target == 0)
31725 target = gen_reg_rtx (mode);
31727 emit_move_insn (target, tmp);
31728 return target;
31731 case IX86_BUILTIN_RDPMC:
31732 case IX86_BUILTIN_RDTSC:
31733 case IX86_BUILTIN_RDTSCP:
31735 op0 = gen_reg_rtx (DImode);
31736 op1 = gen_reg_rtx (DImode);
31738 if (fcode == IX86_BUILTIN_RDPMC)
31740 arg0 = CALL_EXPR_ARG (exp, 0);
31741 op2 = expand_normal (arg0);
31742 if (!register_operand (op2, SImode))
31743 op2 = copy_to_mode_reg (SImode, op2);
31745 insn = (TARGET_64BIT
31746 ? gen_rdpmc_rex64 (op0, op1, op2)
31747 : gen_rdpmc (op0, op2));
31748 emit_insn (insn);
31750 else if (fcode == IX86_BUILTIN_RDTSC)
31752 insn = (TARGET_64BIT
31753 ? gen_rdtsc_rex64 (op0, op1)
31754 : gen_rdtsc (op0));
31755 emit_insn (insn);
31757 else
31759 op2 = gen_reg_rtx (SImode);
31761 insn = (TARGET_64BIT
31762 ? gen_rdtscp_rex64 (op0, op1, op2)
31763 : gen_rdtscp (op0, op2));
31764 emit_insn (insn);
31766 arg0 = CALL_EXPR_ARG (exp, 0);
31767 op4 = expand_normal (arg0);
31768 if (!address_operand (op4, VOIDmode))
31770 op4 = convert_memory_address (Pmode, op4);
31771 op4 = copy_addr_to_reg (op4);
31773 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31776 if (target == 0)
31777 target = gen_reg_rtx (mode);
31779 if (TARGET_64BIT)
31781 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31782 op1, 1, OPTAB_DIRECT);
31783 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31784 op0, 1, OPTAB_DIRECT);
31787 emit_move_insn (target, op0);
31788 return target;
31790 case IX86_BUILTIN_FXSAVE:
31791 case IX86_BUILTIN_FXRSTOR:
31792 case IX86_BUILTIN_FXSAVE64:
31793 case IX86_BUILTIN_FXRSTOR64:
31794 switch (fcode)
31796 case IX86_BUILTIN_FXSAVE:
31797 icode = CODE_FOR_fxsave;
31798 break;
31799 case IX86_BUILTIN_FXRSTOR:
31800 icode = CODE_FOR_fxrstor;
31801 break;
31802 case IX86_BUILTIN_FXSAVE64:
31803 icode = CODE_FOR_fxsave64;
31804 break;
31805 case IX86_BUILTIN_FXRSTOR64:
31806 icode = CODE_FOR_fxrstor64;
31807 break;
31808 default:
31809 gcc_unreachable ();
31812 arg0 = CALL_EXPR_ARG (exp, 0);
31813 op0 = expand_normal (arg0);
31815 if (!address_operand (op0, VOIDmode))
31817 op0 = convert_memory_address (Pmode, op0);
31818 op0 = copy_addr_to_reg (op0);
31820 op0 = gen_rtx_MEM (BLKmode, op0);
31822 pat = GEN_FCN (icode) (op0);
31823 if (pat)
31824 emit_insn (pat);
31825 return 0;
31827 case IX86_BUILTIN_XSAVE:
31828 case IX86_BUILTIN_XRSTOR:
31829 case IX86_BUILTIN_XSAVE64:
31830 case IX86_BUILTIN_XRSTOR64:
31831 case IX86_BUILTIN_XSAVEOPT:
31832 case IX86_BUILTIN_XSAVEOPT64:
31833 arg0 = CALL_EXPR_ARG (exp, 0);
31834 arg1 = CALL_EXPR_ARG (exp, 1);
31835 op0 = expand_normal (arg0);
31836 op1 = expand_normal (arg1);
31838 if (!address_operand (op0, VOIDmode))
31840 op0 = convert_memory_address (Pmode, op0);
31841 op0 = copy_addr_to_reg (op0);
31843 op0 = gen_rtx_MEM (BLKmode, op0);
31845 op1 = force_reg (DImode, op1);
31847 if (TARGET_64BIT)
31849 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31850 NULL, 1, OPTAB_DIRECT);
31851 switch (fcode)
31853 case IX86_BUILTIN_XSAVE:
31854 icode = CODE_FOR_xsave_rex64;
31855 break;
31856 case IX86_BUILTIN_XRSTOR:
31857 icode = CODE_FOR_xrstor_rex64;
31858 break;
31859 case IX86_BUILTIN_XSAVE64:
31860 icode = CODE_FOR_xsave64;
31861 break;
31862 case IX86_BUILTIN_XRSTOR64:
31863 icode = CODE_FOR_xrstor64;
31864 break;
31865 case IX86_BUILTIN_XSAVEOPT:
31866 icode = CODE_FOR_xsaveopt_rex64;
31867 break;
31868 case IX86_BUILTIN_XSAVEOPT64:
31869 icode = CODE_FOR_xsaveopt64;
31870 break;
31871 default:
31872 gcc_unreachable ();
31875 op2 = gen_lowpart (SImode, op2);
31876 op1 = gen_lowpart (SImode, op1);
31877 pat = GEN_FCN (icode) (op0, op1, op2);
31879 else
31881 switch (fcode)
31883 case IX86_BUILTIN_XSAVE:
31884 icode = CODE_FOR_xsave;
31885 break;
31886 case IX86_BUILTIN_XRSTOR:
31887 icode = CODE_FOR_xrstor;
31888 break;
31889 case IX86_BUILTIN_XSAVEOPT:
31890 icode = CODE_FOR_xsaveopt;
31891 break;
31892 default:
31893 gcc_unreachable ();
31895 pat = GEN_FCN (icode) (op0, op1);
31898 if (pat)
31899 emit_insn (pat);
31900 return 0;
31902 case IX86_BUILTIN_LLWPCB:
31903 arg0 = CALL_EXPR_ARG (exp, 0);
31904 op0 = expand_normal (arg0);
31905 icode = CODE_FOR_lwp_llwpcb;
31906 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31907 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31908 emit_insn (gen_lwp_llwpcb (op0));
31909 return 0;
31911 case IX86_BUILTIN_SLWPCB:
31912 icode = CODE_FOR_lwp_slwpcb;
31913 if (!target
31914 || !insn_data[icode].operand[0].predicate (target, Pmode))
31915 target = gen_reg_rtx (Pmode);
31916 emit_insn (gen_lwp_slwpcb (target));
31917 return target;
31919 case IX86_BUILTIN_BEXTRI32:
31920 case IX86_BUILTIN_BEXTRI64:
31921 arg0 = CALL_EXPR_ARG (exp, 0);
31922 arg1 = CALL_EXPR_ARG (exp, 1);
31923 op0 = expand_normal (arg0);
31924 op1 = expand_normal (arg1);
31925 icode = (fcode == IX86_BUILTIN_BEXTRI32
31926 ? CODE_FOR_tbm_bextri_si
31927 : CODE_FOR_tbm_bextri_di);
31928 if (!CONST_INT_P (op1))
31930 error ("last argument must be an immediate");
31931 return const0_rtx;
31933 else
31935 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31936 unsigned char lsb_index = INTVAL (op1) & 0xFF;
31937 op1 = GEN_INT (length);
31938 op2 = GEN_INT (lsb_index);
31939 pat = GEN_FCN (icode) (target, op0, op1, op2);
31940 if (pat)
31941 emit_insn (pat);
31942 return target;
31945 case IX86_BUILTIN_RDRAND16_STEP:
31946 icode = CODE_FOR_rdrandhi_1;
31947 mode0 = HImode;
31948 goto rdrand_step;
31950 case IX86_BUILTIN_RDRAND32_STEP:
31951 icode = CODE_FOR_rdrandsi_1;
31952 mode0 = SImode;
31953 goto rdrand_step;
31955 case IX86_BUILTIN_RDRAND64_STEP:
31956 icode = CODE_FOR_rdranddi_1;
31957 mode0 = DImode;
31959 rdrand_step:
31960 op0 = gen_reg_rtx (mode0);
31961 emit_insn (GEN_FCN (icode) (op0));
31963 arg0 = CALL_EXPR_ARG (exp, 0);
31964 op1 = expand_normal (arg0);
31965 if (!address_operand (op1, VOIDmode))
31967 op1 = convert_memory_address (Pmode, op1);
31968 op1 = copy_addr_to_reg (op1);
31970 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31972 op1 = gen_reg_rtx (SImode);
31973 emit_move_insn (op1, CONST1_RTX (SImode));
31975 /* Emit SImode conditional move. */
31976 if (mode0 == HImode)
31978 op2 = gen_reg_rtx (SImode);
31979 emit_insn (gen_zero_extendhisi2 (op2, op0));
31981 else if (mode0 == SImode)
31982 op2 = op0;
31983 else
31984 op2 = gen_rtx_SUBREG (SImode, op0, 0);
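/* The builtin's return value mirrors the carry flag left by rdrand: the
IF_THEN_ELSE below yields the constant 1 when CF is set (success) and
otherwise the destination register, which the instruction clears when no
random data was available, so the result is 0 on failure. */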
31986 if (target == 0)
31987 target = gen_reg_rtx (SImode);
31989 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
31990 const0_rtx);
31991 emit_insn (gen_rtx_SET (VOIDmode, target,
31992 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
31993 return target;
31995 case IX86_BUILTIN_RDSEED16_STEP:
31996 icode = CODE_FOR_rdseedhi_1;
31997 mode0 = HImode;
31998 goto rdseed_step;
32000 case IX86_BUILTIN_RDSEED32_STEP:
32001 icode = CODE_FOR_rdseedsi_1;
32002 mode0 = SImode;
32003 goto rdseed_step;
32005 case IX86_BUILTIN_RDSEED64_STEP:
32006 icode = CODE_FOR_rdseeddi_1;
32007 mode0 = DImode;
32009 rdseed_step:
32010 op0 = gen_reg_rtx (mode0);
32011 emit_insn (GEN_FCN (icode) (op0));
32013 arg0 = CALL_EXPR_ARG (exp, 0);
32014 op1 = expand_normal (arg0);
32015 if (!address_operand (op1, VOIDmode))
32017 op1 = convert_memory_address (Pmode, op1);
32018 op1 = copy_addr_to_reg (op1);
32020 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32022 op2 = gen_reg_rtx (QImode);
32024 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32025 const0_rtx);
32026 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32028 if (target == 0)
32029 target = gen_reg_rtx (SImode);
32031 emit_insn (gen_zero_extendqisi2 (target, op2));
32032 return target;
32034 case IX86_BUILTIN_ADDCARRYX32:
32035 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32036 mode0 = SImode;
32037 goto addcarryx;
32039 case IX86_BUILTIN_ADDCARRYX64:
32040 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32041 mode0 = DImode;
32043 addcarryx:
32044 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32045 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32046 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32047 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32049 op0 = gen_reg_rtx (QImode);
32051 /* Generate CF from input operand. */
32052 op1 = expand_normal (arg0);
32053 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32054 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
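/* Adding 0xff to the byte C_IN sets CF exactly when C_IN is nonzero,
which reconstructs the incoming carry in the flags register for the
add-with-carry emitted below. */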
32056 /* Generate an add-with-carry (ADCX when TARGET_ADX, plain ADC otherwise) to compute X+Y+CF. */
32057 op2 = expand_normal (arg1);
32058 op3 = expand_normal (arg2);
32060 if (!REG_P (op2))
32061 op2 = copy_to_mode_reg (mode0, op2);
32062 if (!REG_P (op3))
32063 op3 = copy_to_mode_reg (mode0, op3);
32065 op0 = gen_reg_rtx (mode0);
32067 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32068 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32069 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32071 /* Store the result. */
32072 op4 = expand_normal (arg3);
32073 if (!address_operand (op4, VOIDmode))
32075 op4 = convert_memory_address (Pmode, op4);
32076 op4 = copy_addr_to_reg (op4);
32078 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32080 /* Return current CF value. */
32081 if (target == 0)
32082 target = gen_reg_rtx (QImode);
32084 PUT_MODE (pat, QImode);
32085 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32086 return target;
32088 case IX86_BUILTIN_GATHERSIV2DF:
32089 icode = CODE_FOR_avx2_gathersiv2df;
32090 goto gather_gen;
32091 case IX86_BUILTIN_GATHERSIV4DF:
32092 icode = CODE_FOR_avx2_gathersiv4df;
32093 goto gather_gen;
32094 case IX86_BUILTIN_GATHERDIV2DF:
32095 icode = CODE_FOR_avx2_gatherdiv2df;
32096 goto gather_gen;
32097 case IX86_BUILTIN_GATHERDIV4DF:
32098 icode = CODE_FOR_avx2_gatherdiv4df;
32099 goto gather_gen;
32100 case IX86_BUILTIN_GATHERSIV4SF:
32101 icode = CODE_FOR_avx2_gathersiv4sf;
32102 goto gather_gen;
32103 case IX86_BUILTIN_GATHERSIV8SF:
32104 icode = CODE_FOR_avx2_gathersiv8sf;
32105 goto gather_gen;
32106 case IX86_BUILTIN_GATHERDIV4SF:
32107 icode = CODE_FOR_avx2_gatherdiv4sf;
32108 goto gather_gen;
32109 case IX86_BUILTIN_GATHERDIV8SF:
32110 icode = CODE_FOR_avx2_gatherdiv8sf;
32111 goto gather_gen;
32112 case IX86_BUILTIN_GATHERSIV2DI:
32113 icode = CODE_FOR_avx2_gathersiv2di;
32114 goto gather_gen;
32115 case IX86_BUILTIN_GATHERSIV4DI:
32116 icode = CODE_FOR_avx2_gathersiv4di;
32117 goto gather_gen;
32118 case IX86_BUILTIN_GATHERDIV2DI:
32119 icode = CODE_FOR_avx2_gatherdiv2di;
32120 goto gather_gen;
32121 case IX86_BUILTIN_GATHERDIV4DI:
32122 icode = CODE_FOR_avx2_gatherdiv4di;
32123 goto gather_gen;
32124 case IX86_BUILTIN_GATHERSIV4SI:
32125 icode = CODE_FOR_avx2_gathersiv4si;
32126 goto gather_gen;
32127 case IX86_BUILTIN_GATHERSIV8SI:
32128 icode = CODE_FOR_avx2_gathersiv8si;
32129 goto gather_gen;
32130 case IX86_BUILTIN_GATHERDIV4SI:
32131 icode = CODE_FOR_avx2_gatherdiv4si;
32132 goto gather_gen;
32133 case IX86_BUILTIN_GATHERDIV8SI:
32134 icode = CODE_FOR_avx2_gatherdiv8si;
32135 goto gather_gen;
32136 case IX86_BUILTIN_GATHERALTSIV4DF:
32137 icode = CODE_FOR_avx2_gathersiv4df;
32138 goto gather_gen;
32139 case IX86_BUILTIN_GATHERALTDIV8SF:
32140 icode = CODE_FOR_avx2_gatherdiv8sf;
32141 goto gather_gen;
32142 case IX86_BUILTIN_GATHERALTSIV4DI:
32143 icode = CODE_FOR_avx2_gathersiv4di;
32144 goto gather_gen;
32145 case IX86_BUILTIN_GATHERALTDIV8SI:
32146 icode = CODE_FOR_avx2_gatherdiv8si;
32147 goto gather_gen;
32149 gather_gen:
32150 arg0 = CALL_EXPR_ARG (exp, 0);
32151 arg1 = CALL_EXPR_ARG (exp, 1);
32152 arg2 = CALL_EXPR_ARG (exp, 2);
32153 arg3 = CALL_EXPR_ARG (exp, 3);
32154 arg4 = CALL_EXPR_ARG (exp, 4);
32155 op0 = expand_normal (arg0);
32156 op1 = expand_normal (arg1);
32157 op2 = expand_normal (arg2);
32158 op3 = expand_normal (arg3);
32159 op4 = expand_normal (arg4);
32160 /* Note the arg order is different from the operand order. */
32161 mode0 = insn_data[icode].operand[1].mode;
32162 mode2 = insn_data[icode].operand[3].mode;
32163 mode3 = insn_data[icode].operand[4].mode;
32164 mode4 = insn_data[icode].operand[5].mode;
32166 if (target == NULL_RTX
32167 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32168 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32169 else
32170 subtarget = target;
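/* The GATHERALT variants take vector arguments that are twice as wide as
the underlying gather pattern expects (e.g. a V8SI index for a
four-element gather), so only the low halves are extracted here. */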
32172 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32173 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32175 rtx half = gen_reg_rtx (V4SImode);
32176 if (!nonimmediate_operand (op2, V8SImode))
32177 op2 = copy_to_mode_reg (V8SImode, op2);
32178 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32179 op2 = half;
32181 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32182 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32184 rtx (*gen) (rtx, rtx);
32185 rtx half = gen_reg_rtx (mode0);
32186 if (mode0 == V4SFmode)
32187 gen = gen_vec_extract_lo_v8sf;
32188 else
32189 gen = gen_vec_extract_lo_v8si;
32190 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32191 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32192 emit_insn (gen (half, op0));
32193 op0 = half;
32194 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32195 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32196 emit_insn (gen (half, op3));
32197 op3 = half;
32200 /* Force the memory operand into a base register here. We don't
32201 want to do this for the memory operands of other builtin
32202 functions. */
32203 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32205 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32206 op0 = copy_to_mode_reg (mode0, op0);
32207 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32208 op1 = copy_to_mode_reg (Pmode, op1);
32209 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32210 op2 = copy_to_mode_reg (mode2, op2);
32211 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32212 op3 = copy_to_mode_reg (mode3, op3);
32213 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32215 error ("last argument must be scale 1, 2, 4, 8");
32216 return const0_rtx;
32219 /* Optimize. If mask is known to have all high bits set,
32220 replace op0 with pc_rtx to signal that the instruction
32221 overwrites the whole destination and doesn't use its
32222 previous contents. */
32223 if (optimize)
32225 if (TREE_CODE (arg3) == VECTOR_CST)
32227 unsigned int negative = 0;
32228 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32230 tree cst = VECTOR_CST_ELT (arg3, i);
32231 if (TREE_CODE (cst) == INTEGER_CST
32232 && tree_int_cst_sign_bit (cst))
32233 negative++;
32234 else if (TREE_CODE (cst) == REAL_CST
32235 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32236 negative++;
32238 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32239 op0 = pc_rtx;
32241 else if (TREE_CODE (arg3) == SSA_NAME)
32243 /* Recognize also when mask is like:
32244 __v2df src = _mm_setzero_pd ();
32245 __v2df mask = _mm_cmpeq_pd (src, src);
32247 __v8sf src = _mm256_setzero_ps ();
32248 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32249 as that is a cheaper way to load all ones into
32250 a register than having to load a constant from
32251 memory. */
32252 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32253 if (is_gimple_call (def_stmt))
32255 tree fndecl = gimple_call_fndecl (def_stmt);
32256 if (fndecl
32257 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32258 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32260 case IX86_BUILTIN_CMPPD:
32261 case IX86_BUILTIN_CMPPS:
32262 case IX86_BUILTIN_CMPPD256:
32263 case IX86_BUILTIN_CMPPS256:
32264 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32265 break;
32266 /* FALLTHRU */
32267 case IX86_BUILTIN_CMPEQPD:
32268 case IX86_BUILTIN_CMPEQPS:
32269 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32270 && initializer_zerop (gimple_call_arg (def_stmt,
32271 1)))
32272 op0 = pc_rtx;
32273 break;
32274 default:
32275 break;
32281 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32282 if (! pat)
32283 return const0_rtx;
32284 emit_insn (pat);
32286 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32287 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32289 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32290 ? V4SFmode : V4SImode;
32291 if (target == NULL_RTX)
32292 target = gen_reg_rtx (tmode);
32293 if (tmode == V4SFmode)
32294 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32295 else
32296 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32298 else
32299 target = subtarget;
32301 return target;
32303 case IX86_BUILTIN_XABORT:
32304 icode = CODE_FOR_xabort;
32305 arg0 = CALL_EXPR_ARG (exp, 0);
32306 op0 = expand_normal (arg0);
32307 mode0 = insn_data[icode].operand[0].mode;
32308 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32310 error ("the xabort's argument must be an 8-bit immediate");
32311 return const0_rtx;
32313 emit_insn (gen_xabort (op0));
32314 return 0;
32316 default:
32317 break;
32320 for (i = 0, d = bdesc_special_args;
32321 i < ARRAY_SIZE (bdesc_special_args);
32322 i++, d++)
32323 if (d->code == fcode)
32324 return ix86_expand_special_args_builtin (d, exp, target);
32326 for (i = 0, d = bdesc_args;
32327 i < ARRAY_SIZE (bdesc_args);
32328 i++, d++)
32329 if (d->code == fcode)
32330 switch (fcode)
32332 case IX86_BUILTIN_FABSQ:
32333 case IX86_BUILTIN_COPYSIGNQ:
32334 if (!TARGET_SSE)
32335 /* Emit a normal call if SSE isn't available. */
32336 return expand_call (exp, target, ignore);
32337 default:
32338 return ix86_expand_args_builtin (d, exp, target);
32341 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32342 if (d->code == fcode)
32343 return ix86_expand_sse_comi (d, exp, target);
32345 for (i = 0, d = bdesc_pcmpestr;
32346 i < ARRAY_SIZE (bdesc_pcmpestr);
32347 i++, d++)
32348 if (d->code == fcode)
32349 return ix86_expand_sse_pcmpestr (d, exp, target);
32351 for (i = 0, d = bdesc_pcmpistr;
32352 i < ARRAY_SIZE (bdesc_pcmpistr);
32353 i++, d++)
32354 if (d->code == fcode)
32355 return ix86_expand_sse_pcmpistr (d, exp, target);
32357 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32358 if (d->code == fcode)
32359 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32360 (enum ix86_builtin_func_type)
32361 d->flag, d->comparison);
32363 gcc_unreachable ();
32366 /* Returns a function decl for a vectorized version of the builtin function
32367 with builtin function code FN, result vector type TYPE_OUT and argument
32368 vector type TYPE_IN, or NULL_TREE if it is not available. */
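/* For example, when the vectorizer asks for a two-lane double version of
sqrt (TYPE_OUT and TYPE_IN both 2 x double), the switch below returns the
decl recorded for IX86_BUILTIN_SQRTPD. */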
32370 static tree
32371 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32372 tree type_in)
32374 enum machine_mode in_mode, out_mode;
32375 int in_n, out_n;
32376 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32378 if (TREE_CODE (type_out) != VECTOR_TYPE
32379 || TREE_CODE (type_in) != VECTOR_TYPE
32380 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32381 return NULL_TREE;
32383 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32384 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32385 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32386 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32388 switch (fn)
32390 case BUILT_IN_SQRT:
32391 if (out_mode == DFmode && in_mode == DFmode)
32393 if (out_n == 2 && in_n == 2)
32394 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32395 else if (out_n == 4 && in_n == 4)
32396 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32398 break;
32400 case BUILT_IN_SQRTF:
32401 if (out_mode == SFmode && in_mode == SFmode)
32403 if (out_n == 4 && in_n == 4)
32404 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32405 else if (out_n == 8 && in_n == 8)
32406 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32408 break;
32410 case BUILT_IN_IFLOOR:
32411 case BUILT_IN_LFLOOR:
32412 case BUILT_IN_LLFLOOR:
32413 /* The round insn does not trap on denormals. */
32414 if (flag_trapping_math || !TARGET_ROUND)
32415 break;
32417 if (out_mode == SImode && in_mode == DFmode)
32419 if (out_n == 4 && in_n == 2)
32420 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32421 else if (out_n == 8 && in_n == 4)
32422 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32424 break;
32426 case BUILT_IN_IFLOORF:
32427 case BUILT_IN_LFLOORF:
32428 case BUILT_IN_LLFLOORF:
32429 /* The round insn does not trap on denormals. */
32430 if (flag_trapping_math || !TARGET_ROUND)
32431 break;
32433 if (out_mode == SImode && in_mode == SFmode)
32435 if (out_n == 4 && in_n == 4)
32436 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32437 else if (out_n == 8 && in_n == 8)
32438 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32440 break;
32442 case BUILT_IN_ICEIL:
32443 case BUILT_IN_LCEIL:
32444 case BUILT_IN_LLCEIL:
32445 /* The round insn does not trap on denormals. */
32446 if (flag_trapping_math || !TARGET_ROUND)
32447 break;
32449 if (out_mode == SImode && in_mode == DFmode)
32451 if (out_n == 4 && in_n == 2)
32452 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32453 else if (out_n == 8 && in_n == 4)
32454 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32456 break;
32458 case BUILT_IN_ICEILF:
32459 case BUILT_IN_LCEILF:
32460 case BUILT_IN_LLCEILF:
32461 /* The round insn does not trap on denormals. */
32462 if (flag_trapping_math || !TARGET_ROUND)
32463 break;
32465 if (out_mode == SImode && in_mode == SFmode)
32467 if (out_n == 4 && in_n == 4)
32468 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32469 else if (out_n == 8 && in_n == 8)
32470 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32472 break;
32474 case BUILT_IN_IRINT:
32475 case BUILT_IN_LRINT:
32476 case BUILT_IN_LLRINT:
32477 if (out_mode == SImode && in_mode == DFmode)
32479 if (out_n == 4 && in_n == 2)
32480 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32481 else if (out_n == 8 && in_n == 4)
32482 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32484 break;
32486 case BUILT_IN_IRINTF:
32487 case BUILT_IN_LRINTF:
32488 case BUILT_IN_LLRINTF:
32489 if (out_mode == SImode && in_mode == SFmode)
32491 if (out_n == 4 && in_n == 4)
32492 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32493 else if (out_n == 8 && in_n == 8)
32494 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32496 break;
32498 case BUILT_IN_IROUND:
32499 case BUILT_IN_LROUND:
32500 case BUILT_IN_LLROUND:
32501 /* The round insn does not trap on denormals. */
32502 if (flag_trapping_math || !TARGET_ROUND)
32503 break;
32505 if (out_mode == SImode && in_mode == DFmode)
32507 if (out_n == 4 && in_n == 2)
32508 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32509 else if (out_n == 8 && in_n == 4)
32510 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32512 break;
32514 case BUILT_IN_IROUNDF:
32515 case BUILT_IN_LROUNDF:
32516 case BUILT_IN_LLROUNDF:
32517 /* The round insn does not trap on denormals. */
32518 if (flag_trapping_math || !TARGET_ROUND)
32519 break;
32521 if (out_mode == SImode && in_mode == SFmode)
32523 if (out_n == 4 && in_n == 4)
32524 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32525 else if (out_n == 8 && in_n == 8)
32526 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32528 break;
32530 case BUILT_IN_COPYSIGN:
32531 if (out_mode == DFmode && in_mode == DFmode)
32533 if (out_n == 2 && in_n == 2)
32534 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32535 else if (out_n == 4 && in_n == 4)
32536 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32538 break;
32540 case BUILT_IN_COPYSIGNF:
32541 if (out_mode == SFmode && in_mode == SFmode)
32543 if (out_n == 4 && in_n == 4)
32544 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32545 else if (out_n == 8 && in_n == 8)
32546 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32548 break;
32550 case BUILT_IN_FLOOR:
32551 /* The round insn does not trap on denormals. */
32552 if (flag_trapping_math || !TARGET_ROUND)
32553 break;
32555 if (out_mode == DFmode && in_mode == DFmode)
32557 if (out_n == 2 && in_n == 2)
32558 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32559 else if (out_n == 4 && in_n == 4)
32560 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32562 break;
32564 case BUILT_IN_FLOORF:
32565 /* The round insn does not trap on denormals. */
32566 if (flag_trapping_math || !TARGET_ROUND)
32567 break;
32569 if (out_mode == SFmode && in_mode == SFmode)
32571 if (out_n == 4 && in_n == 4)
32572 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32573 else if (out_n == 8 && in_n == 8)
32574 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32576 break;
32578 case BUILT_IN_CEIL:
32579 /* The round insn does not trap on denormals. */
32580 if (flag_trapping_math || !TARGET_ROUND)
32581 break;
32583 if (out_mode == DFmode && in_mode == DFmode)
32585 if (out_n == 2 && in_n == 2)
32586 return ix86_builtins[IX86_BUILTIN_CEILPD];
32587 else if (out_n == 4 && in_n == 4)
32588 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32590 break;
32592 case BUILT_IN_CEILF:
32593 /* The round insn does not trap on denormals. */
32594 if (flag_trapping_math || !TARGET_ROUND)
32595 break;
32597 if (out_mode == SFmode && in_mode == SFmode)
32599 if (out_n == 4 && in_n == 4)
32600 return ix86_builtins[IX86_BUILTIN_CEILPS];
32601 else if (out_n == 8 && in_n == 8)
32602 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32604 break;
32606 case BUILT_IN_TRUNC:
32607 /* The round insn does not trap on denormals. */
32608 if (flag_trapping_math || !TARGET_ROUND)
32609 break;
32611 if (out_mode == DFmode && in_mode == DFmode)
32613 if (out_n == 2 && in_n == 2)
32614 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32615 else if (out_n == 4 && in_n == 4)
32616 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32618 break;
32620 case BUILT_IN_TRUNCF:
32621 /* The round insn does not trap on denormals. */
32622 if (flag_trapping_math || !TARGET_ROUND)
32623 break;
32625 if (out_mode == SFmode && in_mode == SFmode)
32627 if (out_n == 4 && in_n == 4)
32628 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32629 else if (out_n == 8 && in_n == 8)
32630 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32632 break;
32634 case BUILT_IN_RINT:
32635 /* The round insn does not trap on denormals. */
32636 if (flag_trapping_math || !TARGET_ROUND)
32637 break;
32639 if (out_mode == DFmode && in_mode == DFmode)
32641 if (out_n == 2 && in_n == 2)
32642 return ix86_builtins[IX86_BUILTIN_RINTPD];
32643 else if (out_n == 4 && in_n == 4)
32644 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32646 break;
32648 case BUILT_IN_RINTF:
32649 /* The round insn does not trap on denormals. */
32650 if (flag_trapping_math || !TARGET_ROUND)
32651 break;
32653 if (out_mode == SFmode && in_mode == SFmode)
32655 if (out_n == 4 && in_n == 4)
32656 return ix86_builtins[IX86_BUILTIN_RINTPS];
32657 else if (out_n == 8 && in_n == 8)
32658 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32660 break;
32662 case BUILT_IN_ROUND:
32663 /* The round insn does not trap on denormals. */
32664 if (flag_trapping_math || !TARGET_ROUND)
32665 break;
32667 if (out_mode == DFmode && in_mode == DFmode)
32669 if (out_n == 2 && in_n == 2)
32670 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32671 else if (out_n == 4 && in_n == 4)
32672 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32674 break;
32676 case BUILT_IN_ROUNDF:
32677 /* The round insn does not trap on denormals. */
32678 if (flag_trapping_math || !TARGET_ROUND)
32679 break;
32681 if (out_mode == SFmode && in_mode == SFmode)
32683 if (out_n == 4 && in_n == 4)
32684 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32685 else if (out_n == 8 && in_n == 8)
32686 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32688 break;
32690 case BUILT_IN_FMA:
32691 if (out_mode == DFmode && in_mode == DFmode)
32693 if (out_n == 2 && in_n == 2)
32694 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32695 if (out_n == 4 && in_n == 4)
32696 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32698 break;
32700 case BUILT_IN_FMAF:
32701 if (out_mode == SFmode && in_mode == SFmode)
32703 if (out_n == 4 && in_n == 4)
32704 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32705 if (out_n == 8 && in_n == 8)
32706 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32708 break;
32710 default:
32711 break;
32714 /* Dispatch to a handler for a vectorization library. */
32715 if (ix86_veclib_handler)
32716 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32717 type_in);
32719 return NULL_TREE;
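/* Illustrative sketch (not part of GCC itself; the function name below is
   hypothetical): a user-level loop of the kind the mapping above applies to.
   With SSE2 enabled (AVX for the 256-bit case), the BUILT_IN_LRINTF call can
   be replaced by the packed CVTPS2DQ builtin selected in the
   IRINTF/LRINTF/LLRINTF case, letting the loop vectorize.  */

#include <math.h>

void
rint_to_int32 (int *out, const float *in, int n)
{
  int i;
  for (i = 0; i < n; i++)
    out[i] = (int) lrintf (in[i]);	/* candidate for CVTPS2DQ vectorization */
}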
32722 /* Handler for an SVML-style interface to
32723 a library with vectorized intrinsics. */
32725 static tree
32726 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32728 char name[20];
32729 tree fntype, new_fndecl, args;
32730 unsigned arity;
32731 const char *bname;
32732 enum machine_mode el_mode, in_mode;
32733 int n, in_n;
32735 /* The SVML is suitable for unsafe math only. */
32736 if (!flag_unsafe_math_optimizations)
32737 return NULL_TREE;
32739 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32740 n = TYPE_VECTOR_SUBPARTS (type_out);
32741 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32742 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32743 if (el_mode != in_mode
32744 || n != in_n)
32745 return NULL_TREE;
32747 switch (fn)
32749 case BUILT_IN_EXP:
32750 case BUILT_IN_LOG:
32751 case BUILT_IN_LOG10:
32752 case BUILT_IN_POW:
32753 case BUILT_IN_TANH:
32754 case BUILT_IN_TAN:
32755 case BUILT_IN_ATAN:
32756 case BUILT_IN_ATAN2:
32757 case BUILT_IN_ATANH:
32758 case BUILT_IN_CBRT:
32759 case BUILT_IN_SINH:
32760 case BUILT_IN_SIN:
32761 case BUILT_IN_ASINH:
32762 case BUILT_IN_ASIN:
32763 case BUILT_IN_COSH:
32764 case BUILT_IN_COS:
32765 case BUILT_IN_ACOSH:
32766 case BUILT_IN_ACOS:
32767 if (el_mode != DFmode || n != 2)
32768 return NULL_TREE;
32769 break;
32771 case BUILT_IN_EXPF:
32772 case BUILT_IN_LOGF:
32773 case BUILT_IN_LOG10F:
32774 case BUILT_IN_POWF:
32775 case BUILT_IN_TANHF:
32776 case BUILT_IN_TANF:
32777 case BUILT_IN_ATANF:
32778 case BUILT_IN_ATAN2F:
32779 case BUILT_IN_ATANHF:
32780 case BUILT_IN_CBRTF:
32781 case BUILT_IN_SINHF:
32782 case BUILT_IN_SINF:
32783 case BUILT_IN_ASINHF:
32784 case BUILT_IN_ASINF:
32785 case BUILT_IN_COSHF:
32786 case BUILT_IN_COSF:
32787 case BUILT_IN_ACOSHF:
32788 case BUILT_IN_ACOSF:
32789 if (el_mode != SFmode || n != 4)
32790 return NULL_TREE;
32791 break;
32793 default:
32794 return NULL_TREE;
32797 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32799 if (fn == BUILT_IN_LOGF)
32800 strcpy (name, "vmlsLn4");
32801 else if (fn == BUILT_IN_LOG)
32802 strcpy (name, "vmldLn2");
32803 else if (n == 4)
32805 sprintf (name, "vmls%s", bname+10);
32806 name[strlen (name)-1] = '4';
32808 else
32809 sprintf (name, "vmld%s2", bname+10);
32811 /* Convert to uppercase. */
32812 name[4] &= ~0x20;
32814 arity = 0;
32815 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32816 args;
32817 args = TREE_CHAIN (args))
32818 arity++;
32820 if (arity == 1)
32821 fntype = build_function_type_list (type_out, type_in, NULL);
32822 else
32823 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32825 /* Build a function declaration for the vectorized function. */
32826 new_fndecl = build_decl (BUILTINS_LOCATION,
32827 FUNCTION_DECL, get_identifier (name), fntype);
32828 TREE_PUBLIC (new_fndecl) = 1;
32829 DECL_EXTERNAL (new_fndecl) = 1;
32830 DECL_IS_NOVOPS (new_fndecl) = 1;
32831 TREE_READONLY (new_fndecl) = 1;
32833 return new_fndecl;
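/* A minimal standalone sketch (not GCC code) of the SVML name mangling
   above, assuming the builtin's DECL_NAME carries the usual "__builtin_"
   prefix.  For BUILT_IN_SINF with four-element vectors it produces
   "vmlsSin4"; the DFmode variant would be "vmldSin2".  */

#include <stdio.h>
#include <string.h>

int
main (void)
{
  char name[20];
  const char *bname = "__builtin_sinf";	/* assumed builtin name */

  sprintf (name, "vmls%s", bname + 10);	/* drop "__builtin_": "vmlssinf" */
  name[strlen (name) - 1] = '4';	/* trailing 'f' becomes the width */
  name[4] &= ~0x20;			/* capitalize: "vmlsSin4" */
  printf ("%s\n", name);
  return 0;
}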
32836 /* Handler for an ACML-style interface to
32837 a library with vectorized intrinsics. */
32839 static tree
32840 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32842 char name[20] = "__vr.._";
32843 tree fntype, new_fndecl, args;
32844 unsigned arity;
32845 const char *bname;
32846 enum machine_mode el_mode, in_mode;
32847 int n, in_n;
32849 /* The ACML is 64-bit only and suitable for unsafe math only, as
32850 it does not correctly support parts of IEEE with the required
32851 precision such as denormals. */
32852 if (!TARGET_64BIT
32853 || !flag_unsafe_math_optimizations)
32854 return NULL_TREE;
32856 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32857 n = TYPE_VECTOR_SUBPARTS (type_out);
32858 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32859 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32860 if (el_mode != in_mode
32861 || n != in_n)
32862 return NULL_TREE;
32864 switch (fn)
32866 case BUILT_IN_SIN:
32867 case BUILT_IN_COS:
32868 case BUILT_IN_EXP:
32869 case BUILT_IN_LOG:
32870 case BUILT_IN_LOG2:
32871 case BUILT_IN_LOG10:
32872 name[4] = 'd';
32873 name[5] = '2';
32874 if (el_mode != DFmode
32875 || n != 2)
32876 return NULL_TREE;
32877 break;
32879 case BUILT_IN_SINF:
32880 case BUILT_IN_COSF:
32881 case BUILT_IN_EXPF:
32882 case BUILT_IN_POWF:
32883 case BUILT_IN_LOGF:
32884 case BUILT_IN_LOG2F:
32885 case BUILT_IN_LOG10F:
32886 name[4] = 's';
32887 name[5] = '4';
32888 if (el_mode != SFmode
32889 || n != 4)
32890 return NULL_TREE;
32891 break;
32893 default:
32894 return NULL_TREE;
32897 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32898 sprintf (name + 7, "%s", bname+10);
32900 arity = 0;
32901 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32902 args;
32903 args = TREE_CHAIN (args))
32904 arity++;
32906 if (arity == 1)
32907 fntype = build_function_type_list (type_out, type_in, NULL);
32908 else
32909 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32911 /* Build a function declaration for the vectorized function. */
32912 new_fndecl = build_decl (BUILTINS_LOCATION,
32913 FUNCTION_DECL, get_identifier (name), fntype);
32914 TREE_PUBLIC (new_fndecl) = 1;
32915 DECL_EXTERNAL (new_fndecl) = 1;
32916 DECL_IS_NOVOPS (new_fndecl) = 1;
32917 TREE_READONLY (new_fndecl) = 1;
32919 return new_fndecl;
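/* A matching standalone sketch (an assumption, not GCC code) of the ACML
   mangling above: for BUILT_IN_SINF with four-element vectors the template
   "__vr.._" becomes "__vrs4_" and the final name is "__vrs4_sinf"; the
   DFmode variant would be "__vrd2_sin".  */

#include <stdio.h>

int
main (void)
{
  char name[20] = "__vr.._";
  const char *bname = "__builtin_sinf";	/* assumed builtin name */

  name[4] = 's';			/* single precision */
  name[5] = '4';			/* four elements per vector */
  sprintf (name + 7, "%s", bname + 10);	/* append "sinf": "__vrs4_sinf" */
  printf ("%s\n", name);
  return 0;
}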
32922 /* Returns a decl of a function that implements gather load with
32923 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
32924 Return NULL_TREE if it is not available. */
32926 static tree
32927 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32928 const_tree index_type, int scale)
32930 bool si;
32931 enum ix86_builtins code;
32933 if (! TARGET_AVX2)
32934 return NULL_TREE;
32936 if ((TREE_CODE (index_type) != INTEGER_TYPE
32937 && !POINTER_TYPE_P (index_type))
32938 || (TYPE_MODE (index_type) != SImode
32939 && TYPE_MODE (index_type) != DImode))
32940 return NULL_TREE;
32942 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32943 return NULL_TREE;
32945 /* v*gather* insn sign extends index to pointer mode. */
32946 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32947 && TYPE_UNSIGNED (index_type))
32948 return NULL_TREE;
32950 if (scale <= 0
32951 || scale > 8
32952 || (scale & (scale - 1)) != 0)
32953 return NULL_TREE;
32955 si = TYPE_MODE (index_type) == SImode;
32956 switch (TYPE_MODE (mem_vectype))
32958 case V2DFmode:
32959 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32960 break;
32961 case V4DFmode:
32962 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32963 break;
32964 case V2DImode:
32965 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32966 break;
32967 case V4DImode:
32968 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32969 break;
32970 case V4SFmode:
32971 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32972 break;
32973 case V8SFmode:
32974 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32975 break;
32976 case V4SImode:
32977 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32978 break;
32979 case V8SImode:
32980 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
32981 break;
32982 default:
32983 return NULL_TREE;
32986 return ix86_builtins[code];
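/* Illustrative user-level sketch (hypothetical function, not from this
   file): an indexed load that the vectorizer can implement through the AVX2
   gather builtins chosen above.  With 256-bit vectors and SImode indices
   this would use the GATHERALTSIV4DF form, scale 8.  */

void
gather_doubles (double *out, const double *base, const int *idx, int n)
{
  int i;
  for (i = 0; i < n; i++)
    out[i] = base[idx[i]];	/* gather-load candidate with -mavx2 */
}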
32989 /* Returns a code for a target-specific builtin that implements
32990 reciprocal of the function, or NULL_TREE if not available. */
32992 static tree
32993 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
32994 bool sqrt ATTRIBUTE_UNUSED)
32996 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
32997 && flag_finite_math_only && !flag_trapping_math
32998 && flag_unsafe_math_optimizations))
32999 return NULL_TREE;
33001 if (md_fn)
33002 /* Machine dependent builtins. */
33003 switch (fn)
33005 /* Vectorized version of sqrt to rsqrt conversion. */
33006 case IX86_BUILTIN_SQRTPS_NR:
33007 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33009 case IX86_BUILTIN_SQRTPS_NR256:
33010 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33012 default:
33013 return NULL_TREE;
33015 else
33016 /* Normal builtins. */
33017 switch (fn)
33019 /* Sqrt to rsqrt conversion. */
33020 case BUILT_IN_SQRTF:
33021 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33023 default:
33024 return NULL_TREE;
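/* Illustrative user-level sketch (hypothetical function): under
   -funsafe-math-optimizations -ffinite-math-only -fno-trapping-math with
   SSE math, the hook above may let a reciprocal square root like the one
   below be expanded via RSQRTSS/RSQRTPS plus a Newton-Raphson step instead
   of a full square root and divide (typically together with -mrecip).  */

float
inv_sqrt (float x)
{
  return 1.0f / __builtin_sqrtf (x);
}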
33028 /* Helper for avx_vpermilps256_operand et al. This is also used by
33029 the expansion functions to turn the parallel back into a mask.
33030 The return value is 0 for no match and the imm8+1 for a match. */
33033 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33035 unsigned i, nelt = GET_MODE_NUNITS (mode);
33036 unsigned mask = 0;
33037 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33039 if (XVECLEN (par, 0) != (int) nelt)
33040 return 0;
33042 /* Validate that all of the elements are constants, and not totally
33043 out of range. Copy the data into an integral array to make the
33044 subsequent checks easier. */
33045 for (i = 0; i < nelt; ++i)
33047 rtx er = XVECEXP (par, 0, i);
33048 unsigned HOST_WIDE_INT ei;
33050 if (!CONST_INT_P (er))
33051 return 0;
33052 ei = INTVAL (er);
33053 if (ei >= nelt)
33054 return 0;
33055 ipar[i] = ei;
33058 switch (mode)
33060 case V4DFmode:
33061 /* In the 256-bit DFmode case, we can only move elements within
33062 a 128-bit lane. */
33063 for (i = 0; i < 2; ++i)
33065 if (ipar[i] >= 2)
33066 return 0;
33067 mask |= ipar[i] << i;
33069 for (i = 2; i < 4; ++i)
33071 if (ipar[i] < 2)
33072 return 0;
33073 mask |= (ipar[i] - 2) << i;
33075 break;
33077 case V8SFmode:
33078 /* In the 256-bit SFmode case, we have full freedom of movement
33079 within the low 128-bit lane, but the high 128-bit lane must
33080 mirror the exact same pattern. */
33081 for (i = 0; i < 4; ++i)
33082 if (ipar[i] + 4 != ipar[i + 4])
33083 return 0;
33084 nelt = 4;
33085 /* FALLTHRU */
33087 case V2DFmode:
33088 case V4SFmode:
33089 /* In the 128-bit case, we've full freedom in the placement of
33090 the elements from the source operand. */
33091 for (i = 0; i < nelt; ++i)
33092 mask |= ipar[i] << (i * (nelt / 2));
33093 break;
33095 default:
33096 gcc_unreachable ();
33099 /* Make sure success has a non-zero value by adding one. */
33100 return mask + 1;
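/* Worked example (a sketch, not referenced elsewhere): for V4SFmode each
   element index occupies two bits, so the parallel (2 3 0 1) encodes to the
   vpermilps immediate 0x4e and the helper returns 0x4e + 1.  */

#include <assert.h>

static unsigned
vpermilps_imm_example (void)
{
  unsigned char ipar[4] = { 2, 3, 0, 1 };
  unsigned i, nelt = 4, mask = 0;

  for (i = 0; i < nelt; ++i)
    mask |= ipar[i] << (i * (nelt / 2));	/* same encoding as above */
  assert (mask == 0x4e);
  return mask + 1;
}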
33103 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33104 the expansion functions to turn the parallel back into a mask.
33105 The return value is 0 for no match and the imm8+1 for a match. */
33108 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33110 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33111 unsigned mask = 0;
33112 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33114 if (XVECLEN (par, 0) != (int) nelt)
33115 return 0;
33117 /* Validate that all of the elements are constants, and not totally
33118 out of range. Copy the data into an integral array to make the
33119 subsequent checks easier. */
33120 for (i = 0; i < nelt; ++i)
33122 rtx er = XVECEXP (par, 0, i);
33123 unsigned HOST_WIDE_INT ei;
33125 if (!CONST_INT_P (er))
33126 return 0;
33127 ei = INTVAL (er);
33128 if (ei >= 2 * nelt)
33129 return 0;
33130 ipar[i] = ei;
33133 /* Validate that each half of the permute is a run of consecutive elements. */
33134 for (i = 0; i < nelt2 - 1; ++i)
33135 if (ipar[i] + 1 != ipar[i + 1])
33136 return 0;
33137 for (i = nelt2; i < nelt - 1; ++i)
33138 if (ipar[i] + 1 != ipar[i + 1])
33139 return 0;
33141 /* Reconstruct the mask. */
33142 for (i = 0; i < 2; ++i)
33144 unsigned e = ipar[i * nelt2];
33145 if (e % nelt2)
33146 return 0;
33147 e /= nelt2;
33148 mask |= e << (i * 4);
33151 /* Make sure success has a non-zero value by adding one. */
33152 return mask + 1;
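/* Worked example (a sketch): for V8SFmode the parallel (4 5 6 7 0 1 2 3)
   swaps the two 128-bit lanes.  Each half reduces to its lane number, so
   the reconstructed vperm2f128 immediate is 0x01 and the helper returns 2.  */

#include <assert.h>

static unsigned
vperm2f128_imm_example (void)
{
  unsigned char ipar[8] = { 4, 5, 6, 7, 0, 1, 2, 3 };
  unsigned i, nelt2 = 4, mask = 0;

  for (i = 0; i < 2; ++i)
    mask |= (ipar[i * nelt2] / nelt2) << (i * 4);	/* as reconstructed above */
  assert (mask == 0x01);
  return mask + 1;
}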
33155 /* Store OPERAND to the memory after reload is completed. This means
33156 that we can't easily use assign_stack_local. */
33158 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33160 rtx result;
33162 gcc_assert (reload_completed);
33163 if (ix86_using_red_zone ())
33165 result = gen_rtx_MEM (mode,
33166 gen_rtx_PLUS (Pmode,
33167 stack_pointer_rtx,
33168 GEN_INT (-RED_ZONE_SIZE)));
33169 emit_move_insn (result, operand);
33171 else if (TARGET_64BIT)
33173 switch (mode)
33175 case HImode:
33176 case SImode:
33177 operand = gen_lowpart (DImode, operand);
33178 /* FALLTHRU */
33179 case DImode:
33180 emit_insn (
33181 gen_rtx_SET (VOIDmode,
33182 gen_rtx_MEM (DImode,
33183 gen_rtx_PRE_DEC (DImode,
33184 stack_pointer_rtx)),
33185 operand));
33186 break;
33187 default:
33188 gcc_unreachable ();
33190 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33192 else
33194 switch (mode)
33196 case DImode:
33198 rtx operands[2];
33199 split_double_mode (mode, &operand, 1, operands, operands + 1);
33200 emit_insn (
33201 gen_rtx_SET (VOIDmode,
33202 gen_rtx_MEM (SImode,
33203 gen_rtx_PRE_DEC (Pmode,
33204 stack_pointer_rtx)),
33205 operands[1]));
33206 emit_insn (
33207 gen_rtx_SET (VOIDmode,
33208 gen_rtx_MEM (SImode,
33209 gen_rtx_PRE_DEC (Pmode,
33210 stack_pointer_rtx)),
33211 operands[0]));
33213 break;
33214 case HImode:
33215 /* Store HImodes as SImodes. */
33216 operand = gen_lowpart (SImode, operand);
33217 /* FALLTHRU */
33218 case SImode:
33219 emit_insn (
33220 gen_rtx_SET (VOIDmode,
33221 gen_rtx_MEM (GET_MODE (operand),
33222 gen_rtx_PRE_DEC (SImode,
33223 stack_pointer_rtx)),
33224 operand));
33225 break;
33226 default:
33227 gcc_unreachable ();
33229 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33231 return result;
33234 /* Free operand from the memory. */
33235 void
33236 ix86_free_from_memory (enum machine_mode mode)
33238 if (!ix86_using_red_zone ())
33240 int size;
33242 if (mode == DImode || TARGET_64BIT)
33243 size = 8;
33244 else
33245 size = 4;
33246 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33247 to pop or add instruction if registers are available. */
33248 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33249 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33250 GEN_INT (size))));
33254 /* Return a register priority for hard reg REGNO. */
33255 static int
33256 ix86_register_priority (int hard_regno)
33258 /* ebp and r13 as the base always want a displacement, r12 as the
33259 base always wants an index. So discourage their usage in an
33260 address. */
33261 if (hard_regno == R12_REG || hard_regno == R13_REG)
33262 return 0;
33263 if (hard_regno == BP_REG)
33264 return 1;
33265 /* New x86-64 int registers result in bigger code size. Discourage
33266 them. */
33267 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33268 return 2;
33269 /* New x86-64 SSE registers result in bigger code size. Discourage
33270 them. */
33271 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33272 return 2;
33273 /* Usage of AX register results in smaller code. Prefer it. */
33274 if (hard_regno == 0)
33275 return 4;
33276 return 3;
33279 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33281 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33282 QImode must go into class Q_REGS.
33283 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33284 movdf to do mem-to-mem moves through integer regs. */
33286 static reg_class_t
33287 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33289 enum machine_mode mode = GET_MODE (x);
33291 /* We're only allowed to return a subclass of CLASS. Many of the
33292 following checks fail for NO_REGS, so eliminate that early. */
33293 if (regclass == NO_REGS)
33294 return NO_REGS;
33296 /* All classes can load zeros. */
33297 if (x == CONST0_RTX (mode))
33298 return regclass;
33300 /* Force constants into memory if we are loading a (nonzero) constant into
33301 an MMX or SSE register. This is because there are no MMX/SSE instructions
33302 to load from a constant. */
33303 if (CONSTANT_P (x)
33304 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33305 return NO_REGS;
33307 /* Prefer SSE regs only if we can use them for math. */
33308 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33309 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33311 /* Floating-point constants need more complex checks. */
33312 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33314 /* General regs can load everything. */
33315 if (reg_class_subset_p (regclass, GENERAL_REGS))
33316 return regclass;
33318 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33319 zero above. We only want to wind up preferring 80387 registers if
33320 we plan on doing computation with them. */
33321 if (TARGET_80387
33322 && standard_80387_constant_p (x) > 0)
33324 /* Limit class to non-sse. */
33325 if (regclass == FLOAT_SSE_REGS)
33326 return FLOAT_REGS;
33327 if (regclass == FP_TOP_SSE_REGS)
33328 return FP_TOP_REG;
33329 if (regclass == FP_SECOND_SSE_REGS)
33330 return FP_SECOND_REG;
33331 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33332 return regclass;
33335 return NO_REGS;
33338 /* Generally when we see PLUS here, it's the function invariant
33339 (plus soft-fp const_int). Which can only be computed into general
33340 regs. */
33341 if (GET_CODE (x) == PLUS)
33342 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33344 /* QImode constants are easy to load, but non-constant QImode data
33345 must go into Q_REGS. */
33346 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33348 if (reg_class_subset_p (regclass, Q_REGS))
33349 return regclass;
33350 if (reg_class_subset_p (Q_REGS, regclass))
33351 return Q_REGS;
33352 return NO_REGS;
33355 return regclass;
33358 /* Discourage putting floating-point values in SSE registers unless
33359 SSE math is being used, and likewise for the 387 registers. */
33360 static reg_class_t
33361 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33363 enum machine_mode mode = GET_MODE (x);
33365 /* Restrict the output reload class to the register bank that we are doing
33366 math on. If we would like not to return a subset of CLASS, reject this
33367 alternative: if reload cannot do this, it will still use its choice. */
33368 mode = GET_MODE (x);
33369 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33370 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33372 if (X87_FLOAT_MODE_P (mode))
33374 if (regclass == FP_TOP_SSE_REGS)
33375 return FP_TOP_REG;
33376 else if (regclass == FP_SECOND_SSE_REGS)
33377 return FP_SECOND_REG;
33378 else
33379 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33382 return regclass;
33385 static reg_class_t
33386 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33387 enum machine_mode mode, secondary_reload_info *sri)
33389 /* Double-word spills from general registers to non-offsettable memory
33390 references (zero-extended addresses) require special handling. */
33391 if (TARGET_64BIT
33392 && MEM_P (x)
33393 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33394 && rclass == GENERAL_REGS
33395 && !offsettable_memref_p (x))
33397 sri->icode = (in_p
33398 ? CODE_FOR_reload_noff_load
33399 : CODE_FOR_reload_noff_store);
33400 /* Add the cost of moving address to a temporary. */
33401 sri->extra_cost = 1;
33403 return NO_REGS;
33406 /* QImode spills from non-QI registers require
33407 an intermediate register on 32-bit targets. */
33408 if (!TARGET_64BIT
33409 && !in_p && mode == QImode
33410 && (rclass == GENERAL_REGS
33411 || rclass == LEGACY_REGS
33412 || rclass == NON_Q_REGS
33413 || rclass == SIREG
33414 || rclass == DIREG
33415 || rclass == INDEX_REGS))
33417 int regno;
33419 if (REG_P (x))
33420 regno = REGNO (x);
33421 else
33422 regno = -1;
33424 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33425 regno = true_regnum (x);
33427 /* Return Q_REGS if the operand is in memory. */
33428 if (regno == -1)
33429 return Q_REGS;
33432 /* This condition handles the corner case where an expression involving
33433 pointers gets vectorized. We're trying to use the address of a
33434 stack slot as a vector initializer.
33436 (set (reg:V2DI 74 [ vect_cst_.2 ])
33437 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33439 Eventually frame gets turned into sp+offset like this:
33441 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33442 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33443 (const_int 392 [0x188]))))
33445 That later gets turned into:
33447 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33448 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33449 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33451 We'll have the following reload recorded:
33453 Reload 0: reload_in (DI) =
33454 (plus:DI (reg/f:DI 7 sp)
33455 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33456 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33457 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33458 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33459 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33460 reload_reg_rtx: (reg:V2DI 22 xmm1)
33462 Which isn't going to work since SSE instructions can't handle scalar
33463 additions. Returning GENERAL_REGS forces the addition into integer
33464 register and reload can handle subsequent reloads without problems. */
33466 if (in_p && GET_CODE (x) == PLUS
33467 && SSE_CLASS_P (rclass)
33468 && SCALAR_INT_MODE_P (mode))
33469 return GENERAL_REGS;
33471 return NO_REGS;
33474 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33476 static bool
33477 ix86_class_likely_spilled_p (reg_class_t rclass)
33479 switch (rclass)
33481 case AREG:
33482 case DREG:
33483 case CREG:
33484 case BREG:
33485 case AD_REGS:
33486 case SIREG:
33487 case DIREG:
33488 case SSE_FIRST_REG:
33489 case FP_TOP_REG:
33490 case FP_SECOND_REG:
33491 return true;
33493 default:
33494 break;
33497 return false;
33500 /* If we are copying between general and FP registers, we need a memory
33501 location. The same is true for SSE and MMX registers.
33503 To optimize register_move_cost performance, allow inline variant.
33505 The macro can't work reliably when one of the CLASSES is a class containing
33506 registers from multiple units (SSE, MMX, integer). We avoid this by never
33507 combining those units in a single alternative in the machine description.
33508 Ensure that this constraint holds to avoid unexpected surprises.
33510 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33511 enforce these sanity checks. */
33513 static inline bool
33514 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33515 enum machine_mode mode, int strict)
33517 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33518 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33519 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33520 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33521 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33522 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33524 gcc_assert (!strict || lra_in_progress);
33525 return true;
33528 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33529 return true;
33531 /* ??? This is a lie. We do have moves between mmx/general, and for
33532 mmx/sse2. But by saying we need secondary memory we discourage the
33533 register allocator from using the mmx registers unless needed. */
33534 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33535 return true;
33537 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33539 /* SSE1 doesn't have any direct moves from other classes. */
33540 if (!TARGET_SSE2)
33541 return true;
33543 /* If the target says that inter-unit moves are more expensive
33544 than moving through memory, then don't generate them. */
33545 if (!TARGET_INTER_UNIT_MOVES)
33546 return true;
33548 /* Between SSE and general, we have moves no larger than word size. */
33549 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33550 return true;
33553 return false;
33556 bool
33557 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33558 enum machine_mode mode, int strict)
33560 return inline_secondary_memory_needed (class1, class2, mode, strict);
33563 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33565 On the 80386, this is the size of MODE in words,
33566 except in the FP regs, where a single reg is always enough. */
33568 static unsigned char
33569 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33571 if (MAYBE_INTEGER_CLASS_P (rclass))
33573 if (mode == XFmode)
33574 return (TARGET_64BIT ? 2 : 3);
33575 else if (mode == XCmode)
33576 return (TARGET_64BIT ? 4 : 6);
33577 else
33578 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33580 else
33582 if (COMPLEX_MODE_P (mode))
33583 return 2;
33584 else
33585 return 1;
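/* Worked example (a sketch of the rule above, not used by GCC): assuming a
   32-bit target where XFmode occupies 12 bytes, an XFmode value needs three
   GENERAL_REGS registers but only one FLOAT_REGS register, since a single
   x87 register always holds a full 80-bit value.  */

#include <assert.h>

static void
class_max_nregs_example (void)
{
  const unsigned units_per_word = 4;	/* assumed 32-bit target */
  const unsigned xfmode_size = 12;	/* assumed XFmode size with -m32 */

  /* Integer classes: size rounded up to whole words.  */
  assert ((xfmode_size + units_per_word - 1) / units_per_word == 3);
}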
33589 /* Return true if the registers in CLASS cannot represent the change from
33590 modes FROM to TO. */
33592 bool
33593 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33594 enum reg_class regclass)
33596 if (from == to)
33597 return false;
33599 /* x87 registers can't do subreg at all, as all values are reformatted
33600 to extended precision. */
33601 if (MAYBE_FLOAT_CLASS_P (regclass))
33602 return true;
33604 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33606 /* Vector registers do not support QI or HImode loads. If we don't
33607 disallow a change to these modes, reload will assume it's ok to
33608 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33609 the vec_dupv4hi pattern. */
33610 if (GET_MODE_SIZE (from) < 4)
33611 return true;
33613 /* Vector registers do not support subreg with nonzero offsets, which
33614 are otherwise valid for integer registers. Since we can't see
33615 whether we have a nonzero offset from here, prohibit all
33616 nonparadoxical subregs changing size. */
33617 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33618 return true;
33621 return false;
33624 /* Return the cost of moving data of mode M between a
33625 register and memory. A value of 2 is the default; this cost is
33626 relative to those in `REGISTER_MOVE_COST'.
33628 This function is used extensively by register_move_cost, which is used to
33629 build tables at startup, so make it inline in this case.
33630 When IN is 2, return the maximum of the in and out move costs.
33632 If moving between registers and memory is more expensive than
33633 between two registers, you should define this macro to express the
33634 relative cost.
33636 Also model the increased cost of moving QImode registers in non-Q_REGS
33637 classes.
33639 static inline int
33640 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33641 int in)
33643 int cost;
33644 if (FLOAT_CLASS_P (regclass))
33646 int index;
33647 switch (mode)
33649 case SFmode:
33650 index = 0;
33651 break;
33652 case DFmode:
33653 index = 1;
33654 break;
33655 case XFmode:
33656 index = 2;
33657 break;
33658 default:
33659 return 100;
33661 if (in == 2)
33662 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33663 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33665 if (SSE_CLASS_P (regclass))
33667 int index;
33668 switch (GET_MODE_SIZE (mode))
33670 case 4:
33671 index = 0;
33672 break;
33673 case 8:
33674 index = 1;
33675 break;
33676 case 16:
33677 index = 2;
33678 break;
33679 default:
33680 return 100;
33682 if (in == 2)
33683 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33684 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33686 if (MMX_CLASS_P (regclass))
33688 int index;
33689 switch (GET_MODE_SIZE (mode))
33691 case 4:
33692 index = 0;
33693 break;
33694 case 8:
33695 index = 1;
33696 break;
33697 default:
33698 return 100;
33700 if (in == 2)
33701 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33702 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33704 switch (GET_MODE_SIZE (mode))
33706 case 1:
33707 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33709 if (!in)
33710 return ix86_cost->int_store[0];
33711 if (TARGET_PARTIAL_REG_DEPENDENCY
33712 && optimize_function_for_speed_p (cfun))
33713 cost = ix86_cost->movzbl_load;
33714 else
33715 cost = ix86_cost->int_load[0];
33716 if (in == 2)
33717 return MAX (cost, ix86_cost->int_store[0]);
33718 return cost;
33720 else
33722 if (in == 2)
33723 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33724 if (in)
33725 return ix86_cost->movzbl_load;
33726 else
33727 return ix86_cost->int_store[0] + 4;
33729 break;
33730 case 2:
33731 if (in == 2)
33732 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33733 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33734 default:
33735 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33736 if (mode == TFmode)
33737 mode = XFmode;
33738 if (in == 2)
33739 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33740 else if (in)
33741 cost = ix86_cost->int_load[2];
33742 else
33743 cost = ix86_cost->int_store[2];
33744 return (cost * (((int) GET_MODE_SIZE (mode)
33745 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33749 static int
33750 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33751 bool in)
33753 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33757 /* Return the cost of moving data from a register in class CLASS1 to
33758 one in class CLASS2.
33760 It is not required that the cost always equal 2 when FROM is the same as TO;
33761 on some machines it is expensive to move between registers if they are not
33762 general registers. */
33764 static int
33765 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33766 reg_class_t class2_i)
33768 enum reg_class class1 = (enum reg_class) class1_i;
33769 enum reg_class class2 = (enum reg_class) class2_i;
33771 /* In case we require secondary memory, compute the cost of the store followed
33772 by a load. In order to avoid bad register allocation choices, we need
33773 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33775 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33777 int cost = 1;
33779 cost += inline_memory_move_cost (mode, class1, 2);
33780 cost += inline_memory_move_cost (mode, class2, 2);
33782 /* In the case of copying from a general purpose register we may emit multiple
33783 stores followed by a single load, causing a memory size mismatch stall.
33784 Count this as an arbitrarily high cost of 20. */
33785 if (targetm.class_max_nregs (class1, mode)
33786 > targetm.class_max_nregs (class2, mode))
33787 cost += 20;
33789 /* In the case of FP/MMX moves, the registers actually overlap, and we
33790 have to switch modes in order to treat them differently. */
33791 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33792 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33793 cost += 20;
33795 return cost;
33798 /* Moves between SSE/MMX and integer unit are expensive. */
33799 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33800 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33802 /* ??? By keeping returned value relatively high, we limit the number
33803 of moves between integer and MMX/SSE registers for all targets.
33804 Additionally, high value prevents problem with x86_modes_tieable_p(),
33805 where integer modes in MMX/SSE registers are not tieable
33806 because of missing QImode and HImode moves to, from or between
33807 MMX/SSE registers. */
33808 return MAX (8, ix86_cost->mmxsse_to_integer);
33810 if (MAYBE_FLOAT_CLASS_P (class1))
33811 return ix86_cost->fp_move;
33812 if (MAYBE_SSE_CLASS_P (class1))
33813 return ix86_cost->sse_move;
33814 if (MAYBE_MMX_CLASS_P (class1))
33815 return ix86_cost->mmx_move;
33816 return 2;
33819 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33820 MODE. */
33822 bool
33823 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33825 /* Flags and only flags can only hold CCmode values. */
33826 if (CC_REGNO_P (regno))
33827 return GET_MODE_CLASS (mode) == MODE_CC;
33828 if (GET_MODE_CLASS (mode) == MODE_CC
33829 || GET_MODE_CLASS (mode) == MODE_RANDOM
33830 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33831 return false;
33832 if (STACK_REGNO_P (regno))
33833 return VALID_FP_MODE_P (mode);
33834 if (SSE_REGNO_P (regno))
33836 /* We implement the move patterns for all vector modes into and
33837 out of SSE registers, even when no operation instructions
33838 are available. OImode move is available only when AVX is
33839 enabled. */
33840 return ((TARGET_AVX && mode == OImode)
33841 || VALID_AVX256_REG_MODE (mode)
33842 || VALID_SSE_REG_MODE (mode)
33843 || VALID_SSE2_REG_MODE (mode)
33844 || VALID_MMX_REG_MODE (mode)
33845 || VALID_MMX_REG_MODE_3DNOW (mode));
33847 if (MMX_REGNO_P (regno))
33849 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33850 so if the register is available at all, then we can move data of
33851 the given mode into or out of it. */
33852 return (VALID_MMX_REG_MODE (mode)
33853 || VALID_MMX_REG_MODE_3DNOW (mode));
33856 if (mode == QImode)
33858 /* Take care for QImode values - they can be in non-QI regs,
33859 but then they do cause partial register stalls. */
33860 if (TARGET_64BIT || QI_REGNO_P (regno))
33861 return true;
33862 if (!TARGET_PARTIAL_REG_STALL)
33863 return true;
33864 return !can_create_pseudo_p ();
33866 /* We handle both integers and floats in the general purpose registers. */
33867 else if (VALID_INT_MODE_P (mode))
33868 return true;
33869 else if (VALID_FP_MODE_P (mode))
33870 return true;
33871 else if (VALID_DFP_MODE_P (mode))
33872 return true;
33873 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33874 on to use that value in smaller contexts, this can easily force a
33875 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33876 supporting DImode, allow it. */
33877 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33878 return true;
33880 return false;
33883 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33884 tieable integer mode. */
33886 static bool
33887 ix86_tieable_integer_mode_p (enum machine_mode mode)
33889 switch (mode)
33891 case HImode:
33892 case SImode:
33893 return true;
33895 case QImode:
33896 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33898 case DImode:
33899 return TARGET_64BIT;
33901 default:
33902 return false;
33906 /* Return true if MODE1 is accessible in a register that can hold MODE2
33907 without copying. That is, all register classes that can hold MODE2
33908 can also hold MODE1. */
33910 bool
33911 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33913 if (mode1 == mode2)
33914 return true;
33916 if (ix86_tieable_integer_mode_p (mode1)
33917 && ix86_tieable_integer_mode_p (mode2))
33918 return true;
33920 /* MODE2 being XFmode implies fp stack or general regs, which means we
33921 can tie any smaller floating point modes to it. Note that we do not
33922 tie this with TFmode. */
33923 if (mode2 == XFmode)
33924 return mode1 == SFmode || mode1 == DFmode;
33926 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33927 that we can tie it with SFmode. */
33928 if (mode2 == DFmode)
33929 return mode1 == SFmode;
33931 /* If MODE2 is only appropriate for an SSE register, then tie with
33932 any other mode acceptable to SSE registers. */
33933 if (GET_MODE_SIZE (mode2) == 32
33934 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33935 return (GET_MODE_SIZE (mode1) == 32
33936 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33937 if (GET_MODE_SIZE (mode2) == 16
33938 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33939 return (GET_MODE_SIZE (mode1) == 16
33940 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33942 /* If MODE2 is appropriate for an MMX register, then tie
33943 with any other mode acceptable to MMX registers. */
33944 if (GET_MODE_SIZE (mode2) == 8
33945 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33946 return (GET_MODE_SIZE (mode1) == 8
33947 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33949 return false;
33952 /* Return the cost of moving between two registers of mode MODE. */
33954 static int
33955 ix86_set_reg_reg_cost (enum machine_mode mode)
33957 unsigned int units = UNITS_PER_WORD;
33959 switch (GET_MODE_CLASS (mode))
33961 default:
33962 break;
33964 case MODE_CC:
33965 units = GET_MODE_SIZE (CCmode);
33966 break;
33968 case MODE_FLOAT:
33969 if ((TARGET_SSE && mode == TFmode)
33970 || (TARGET_80387 && mode == XFmode)
33971 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33972 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33973 units = GET_MODE_SIZE (mode);
33974 break;
33976 case MODE_COMPLEX_FLOAT:
33977 if ((TARGET_SSE && mode == TCmode)
33978 || (TARGET_80387 && mode == XCmode)
33979 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33980 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
33981 units = GET_MODE_SIZE (mode);
33982 break;
33984 case MODE_VECTOR_INT:
33985 case MODE_VECTOR_FLOAT:
33986 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33987 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33988 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33989 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
33990 units = GET_MODE_SIZE (mode);
33993 /* Return the cost of moving between two registers of mode MODE,
33994 assuming that the move will be in pieces of at most UNITS bytes. */
33995 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
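/* Worked example (a sketch): with 4-byte words a DImode register copy is
   done in two word-sized pieces, so the cost above is COSTS_N_INSNS (2);
   with 8-byte words the same copy costs COSTS_N_INSNS (1).  */

#include <assert.h>

static void
set_reg_reg_cost_example (void)
{
  const unsigned dimode_size = 8;

  assert ((dimode_size + 4 - 1) / 4 == 2);	/* UNITS_PER_WORD == 4 */
  assert ((dimode_size + 8 - 1) / 8 == 1);	/* UNITS_PER_WORD == 8 */
}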
33998 /* Compute a (partial) cost for rtx X. Return true if the complete
33999 cost has been computed, and false if subexpressions should be
34000 scanned. In either case, *TOTAL contains the cost result. */
34002 static bool
34003 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34004 bool speed)
34006 enum rtx_code code = (enum rtx_code) code_i;
34007 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34008 enum machine_mode mode = GET_MODE (x);
34009 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34011 switch (code)
34013 case SET:
34014 if (register_operand (SET_DEST (x), VOIDmode)
34015 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34017 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34018 return true;
34020 return false;
34022 case CONST_INT:
34023 case CONST:
34024 case LABEL_REF:
34025 case SYMBOL_REF:
34026 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34027 *total = 3;
34028 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34029 *total = 2;
34030 else if (flag_pic && SYMBOLIC_CONST (x)
34031 && (!TARGET_64BIT
34032 || (GET_CODE (x) != LABEL_REF
34033 && (GET_CODE (x) != SYMBOL_REF
34034 || !SYMBOL_REF_LOCAL_P (x)))))
34035 *total = 1;
34036 else
34037 *total = 0;
34038 return true;
34040 case CONST_DOUBLE:
34041 if (mode == VOIDmode)
34043 *total = 0;
34044 return true;
34046 switch (standard_80387_constant_p (x))
34048 case 1: /* 0.0 */
34049 *total = 1;
34050 return true;
34051 default: /* Other constants */
34052 *total = 2;
34053 return true;
34054 case 0:
34055 case -1:
34056 break;
34058 if (SSE_FLOAT_MODE_P (mode))
34060 case CONST_VECTOR:
34061 switch (standard_sse_constant_p (x))
34063 case 0:
34064 break;
34065 case 1: /* 0: xor eliminates false dependency */
34066 *total = 0;
34067 return true;
34068 default: /* -1: cmp contains false dependency */
34069 *total = 1;
34070 return true;
34073 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34074 it'll probably end up. Add a penalty for size. */
34075 *total = (COSTS_N_INSNS (1)
34076 + (flag_pic != 0 && !TARGET_64BIT)
34077 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34078 return true;
34080 case ZERO_EXTEND:
34081 /* The zero extension is often completely free on x86_64, so make
34082 it as cheap as possible. */
34083 if (TARGET_64BIT && mode == DImode
34084 && GET_MODE (XEXP (x, 0)) == SImode)
34085 *total = 1;
34086 else if (TARGET_ZERO_EXTEND_WITH_AND)
34087 *total = cost->add;
34088 else
34089 *total = cost->movzx;
34090 return false;
34092 case SIGN_EXTEND:
34093 *total = cost->movsx;
34094 return false;
34096 case ASHIFT:
34097 if (SCALAR_INT_MODE_P (mode)
34098 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34099 && CONST_INT_P (XEXP (x, 1)))
34101 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34102 if (value == 1)
34104 *total = cost->add;
34105 return false;
34107 if ((value == 2 || value == 3)
34108 && cost->lea <= cost->shift_const)
34110 *total = cost->lea;
34111 return false;
34114 /* FALLTHRU */
34116 case ROTATE:
34117 case ASHIFTRT:
34118 case LSHIFTRT:
34119 case ROTATERT:
34120 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34122 /* ??? Should be SSE vector operation cost. */
34123 /* At least for published AMD latencies, this really is the same
34124 as the latency for a simple fpu operation like fabs. */
34125 /* V*QImode is emulated with 1-11 insns. */
34126 if (mode == V16QImode || mode == V32QImode)
34128 int count = 11;
34129 if (TARGET_XOP && mode == V16QImode)
34131 /* For XOP we use vpshab, which requires a broadcast of the
34132 value to the variable shift insn. For constants this
34133 means a V16Q const in mem; even when we can perform the
34134 shift with one insn set the cost to prefer paddb. */
34135 if (CONSTANT_P (XEXP (x, 1)))
34137 *total = (cost->fabs
34138 + rtx_cost (XEXP (x, 0), code, 0, speed)
34139 + (speed ? 2 : COSTS_N_BYTES (16)));
34140 return true;
34142 count = 3;
34144 else if (TARGET_SSSE3)
34145 count = 7;
34146 *total = cost->fabs * count;
34148 else
34149 *total = cost->fabs;
34151 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34153 if (CONST_INT_P (XEXP (x, 1)))
34155 if (INTVAL (XEXP (x, 1)) > 32)
34156 *total = cost->shift_const + COSTS_N_INSNS (2);
34157 else
34158 *total = cost->shift_const * 2;
34160 else
34162 if (GET_CODE (XEXP (x, 1)) == AND)
34163 *total = cost->shift_var * 2;
34164 else
34165 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34168 else
34170 if (CONST_INT_P (XEXP (x, 1)))
34171 *total = cost->shift_const;
34172 else
34173 *total = cost->shift_var;
34175 return false;
34177 case FMA:
34179 rtx sub;
34181 gcc_assert (FLOAT_MODE_P (mode));
34182 gcc_assert (TARGET_FMA || TARGET_FMA4);
34184 /* ??? SSE scalar/vector cost should be used here. */
34185 /* ??? Bald assumption that fma has the same cost as fmul. */
34186 *total = cost->fmul;
34187 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34189 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34190 sub = XEXP (x, 0);
34191 if (GET_CODE (sub) == NEG)
34192 sub = XEXP (sub, 0);
34193 *total += rtx_cost (sub, FMA, 0, speed);
34195 sub = XEXP (x, 2);
34196 if (GET_CODE (sub) == NEG)
34197 sub = XEXP (sub, 0);
34198 *total += rtx_cost (sub, FMA, 2, speed);
34199 return true;
34202 case MULT:
34203 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34205 /* ??? SSE scalar cost should be used here. */
34206 *total = cost->fmul;
34207 return false;
34209 else if (X87_FLOAT_MODE_P (mode))
34211 *total = cost->fmul;
34212 return false;
34214 else if (FLOAT_MODE_P (mode))
34216 /* ??? SSE vector cost should be used here. */
34217 *total = cost->fmul;
34218 return false;
34220 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34222 /* V*QImode is emulated with 7-13 insns. */
34223 if (mode == V16QImode || mode == V32QImode)
34225 int extra = 11;
34226 if (TARGET_XOP && mode == V16QImode)
34227 extra = 5;
34228 else if (TARGET_SSSE3)
34229 extra = 6;
34230 *total = cost->fmul * 2 + cost->fabs * extra;
34232 /* V*DImode is emulated with 5-8 insns. */
34233 else if (mode == V2DImode || mode == V4DImode)
34235 if (TARGET_XOP && mode == V2DImode)
34236 *total = cost->fmul * 2 + cost->fabs * 3;
34237 else
34238 *total = cost->fmul * 3 + cost->fabs * 5;
34240 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34241 insns, including two PMULUDQ. */
34242 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34243 *total = cost->fmul * 2 + cost->fabs * 5;
34244 else
34245 *total = cost->fmul;
34246 return false;
34248 else
34250 rtx op0 = XEXP (x, 0);
34251 rtx op1 = XEXP (x, 1);
34252 int nbits;
34253 if (CONST_INT_P (XEXP (x, 1)))
34255 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34256 for (nbits = 0; value != 0; value &= value - 1)
34257 nbits++;
34259 else
34260 /* This is arbitrary. */
34261 nbits = 7;
34263 /* Compute costs correctly for widening multiplication. */
34264 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34265 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34266 == GET_MODE_SIZE (mode))
34268 int is_mulwiden = 0;
34269 enum machine_mode inner_mode = GET_MODE (op0);
34271 if (GET_CODE (op0) == GET_CODE (op1))
34272 is_mulwiden = 1, op1 = XEXP (op1, 0);
34273 else if (CONST_INT_P (op1))
34275 if (GET_CODE (op0) == SIGN_EXTEND)
34276 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34277 == INTVAL (op1);
34278 else
34279 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34282 if (is_mulwiden)
34283 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34286 *total = (cost->mult_init[MODE_INDEX (mode)]
34287 + nbits * cost->mult_bit
34288 + rtx_cost (op0, outer_code, opno, speed)
34289 + rtx_cost (op1, outer_code, opno, speed));
34291 return true;
34294 case DIV:
34295 case UDIV:
34296 case MOD:
34297 case UMOD:
34298 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34299 /* ??? SSE cost should be used here. */
34300 *total = cost->fdiv;
34301 else if (X87_FLOAT_MODE_P (mode))
34302 *total = cost->fdiv;
34303 else if (FLOAT_MODE_P (mode))
34304 /* ??? SSE vector cost should be used here. */
34305 *total = cost->fdiv;
34306 else
34307 *total = cost->divide[MODE_INDEX (mode)];
34308 return false;
34310 case PLUS:
34311 if (GET_MODE_CLASS (mode) == MODE_INT
34312 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34314 if (GET_CODE (XEXP (x, 0)) == PLUS
34315 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34316 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34317 && CONSTANT_P (XEXP (x, 1)))
34319 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34320 if (val == 2 || val == 4 || val == 8)
34322 *total = cost->lea;
34323 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34324 outer_code, opno, speed);
34325 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34326 outer_code, opno, speed);
34327 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34328 return true;
34331 else if (GET_CODE (XEXP (x, 0)) == MULT
34332 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34334 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34335 if (val == 2 || val == 4 || val == 8)
34337 *total = cost->lea;
34338 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34339 outer_code, opno, speed);
34340 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34341 return true;
34344 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34346 *total = cost->lea;
34347 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34348 outer_code, opno, speed);
34349 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34350 outer_code, opno, speed);
34351 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34352 return true;
34355 /* FALLTHRU */
34357 case MINUS:
34358 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34360 /* ??? SSE cost should be used here. */
34361 *total = cost->fadd;
34362 return false;
34364 else if (X87_FLOAT_MODE_P (mode))
34366 *total = cost->fadd;
34367 return false;
34369 else if (FLOAT_MODE_P (mode))
34371 /* ??? SSE vector cost should be used here. */
34372 *total = cost->fadd;
34373 return false;
34375 /* FALLTHRU */
34377 case AND:
34378 case IOR:
34379 case XOR:
34380 if (GET_MODE_CLASS (mode) == MODE_INT
34381 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34383 *total = (cost->add * 2
34384 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34385 << (GET_MODE (XEXP (x, 0)) != DImode))
34386 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34387 << (GET_MODE (XEXP (x, 1)) != DImode)));
34388 return true;
34390 /* FALLTHRU */
34392 case NEG:
34393 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34395 /* ??? SSE cost should be used here. */
34396 *total = cost->fchs;
34397 return false;
34399 else if (X87_FLOAT_MODE_P (mode))
34401 *total = cost->fchs;
34402 return false;
34404 else if (FLOAT_MODE_P (mode))
34406 /* ??? SSE vector cost should be used here. */
34407 *total = cost->fchs;
34408 return false;
34410 /* FALLTHRU */
34412 case NOT:
34413 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34415 /* ??? Should be SSE vector operation cost. */
34416 /* At least for published AMD latencies, this really is the same
34417 as the latency for a simple fpu operation like fabs. */
34418 *total = cost->fabs;
34420 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34421 *total = cost->add * 2;
34422 else
34423 *total = cost->add;
34424 return false;
34426 case COMPARE:
34427 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34428 && XEXP (XEXP (x, 0), 1) == const1_rtx
34429 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34430 && XEXP (x, 1) == const0_rtx)
34432 /* This kind of construct is implemented using test[bwl].
34433 Treat it as if we had an AND. */
34434 *total = (cost->add
34435 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34436 + rtx_cost (const1_rtx, outer_code, opno, speed));
34437 return true;
34439 return false;
34441 case FLOAT_EXTEND:
34442 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34443 *total = 0;
34444 return false;
34446 case ABS:
34447 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34448 /* ??? SSE cost should be used here. */
34449 *total = cost->fabs;
34450 else if (X87_FLOAT_MODE_P (mode))
34451 *total = cost->fabs;
34452 else if (FLOAT_MODE_P (mode))
34453 /* ??? SSE vector cost should be used here. */
34454 *total = cost->fabs;
34455 return false;
34457 case SQRT:
34458 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34459 /* ??? SSE cost should be used here. */
34460 *total = cost->fsqrt;
34461 else if (X87_FLOAT_MODE_P (mode))
34462 *total = cost->fsqrt;
34463 else if (FLOAT_MODE_P (mode))
34464 /* ??? SSE vector cost should be used here. */
34465 *total = cost->fsqrt;
34466 return false;
34468 case UNSPEC:
34469 if (XINT (x, 1) == UNSPEC_TP)
34470 *total = 0;
34471 return false;
34473 case VEC_SELECT:
34474 case VEC_CONCAT:
34475 case VEC_MERGE:
34476 case VEC_DUPLICATE:
34477 /* ??? Assume all of these vector manipulation patterns are
34478 recognizable. In which case they all pretty much have the
34479 same cost. */
34480 *total = cost->fabs;
34481 return true;
34483 default:
34484 return false;
34488 #if TARGET_MACHO
34490 static int current_machopic_label_num;
34492 /* Given a symbol name and its associated stub, write out the
34493 definition of the stub. */
34495 void
34496 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34498 unsigned int length;
34499 char *binder_name, *symbol_name, lazy_ptr_name[32];
34500 int label = ++current_machopic_label_num;
34502 /* For 64-bit we shouldn't get here. */
34503 gcc_assert (!TARGET_64BIT);
34505 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34506 symb = targetm.strip_name_encoding (symb);
34508 length = strlen (stub);
34509 binder_name = XALLOCAVEC (char, length + 32);
34510 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34512 length = strlen (symb);
34513 symbol_name = XALLOCAVEC (char, length + 32);
34514 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34516 sprintf (lazy_ptr_name, "L%d$lz", label);
34518 if (MACHOPIC_ATT_STUB)
34519 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34520 else if (MACHOPIC_PURE)
34521 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34522 else
34523 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34525 fprintf (file, "%s:\n", stub);
34526 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34528 if (MACHOPIC_ATT_STUB)
34530 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34532 else if (MACHOPIC_PURE)
34534 /* PIC stub. */
34535 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34536 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34537 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34538 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34539 label, lazy_ptr_name, label);
34540 fprintf (file, "\tjmp\t*%%ecx\n");
34542 else
34543 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34545 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34546 it needs no stub-binding-helper. */
34547 if (MACHOPIC_ATT_STUB)
34548 return;
34550 fprintf (file, "%s:\n", binder_name);
34552 if (MACHOPIC_PURE)
34554 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34555 fprintf (file, "\tpushl\t%%ecx\n");
34557 else
34558 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34560 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34562 /* N.B. Keep the correspondence of these
34563 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34564 old-pic/new-pic/non-pic stubs; altering this will break
34565 compatibility with existing dylibs. */
34566 if (MACHOPIC_PURE)
34568 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34569 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34571 else
34572 /* 16-byte -mdynamic-no-pic stub. */
34573 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34575 fprintf (file, "%s:\n", lazy_ptr_name);
34576 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34577 fprintf (file, ASM_LONG "%s\n", binder_name);
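/* For illustration only: in the MACHOPIC_PURE case the code above emits a
   stub of roughly this shape (the actual stub and binder labels come from
   GEN_BINDER_NAME_FOR_STUB / GEN_SYMBOL_NAME_FOR_SYMBOL, and the
   get_pc_thunk name is target dependent):

	<stub>:
		.indirect_symbol _foo
		call	<get_pc_thunk.cx>
	LPC$1:	movl	L1$lz-LPC$1(%ecx),%ecx
		jmp	*%ecx
	<binder>:
		lea	L1$lz-<binder>(%ecx),%ecx
		pushl	%ecx
		jmp	dyld_stub_binding_helper
	L1$lz:
		.indirect_symbol _foo
		.long	<binder>
*/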
34579 #endif /* TARGET_MACHO */
34581 /* Order the registers for register allocator. */
34583 void
34584 x86_order_regs_for_local_alloc (void)
34586 int pos = 0;
34587 int i;
34589 /* First allocate the local general purpose registers. */
34590 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34591 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34592 reg_alloc_order [pos++] = i;
34594 /* Global general purpose registers. */
34595 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34596 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34597 reg_alloc_order [pos++] = i;
34599 /* x87 registers come first in case we are doing FP math
34600 using them. */
34601 if (!TARGET_SSE_MATH)
34602 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34603 reg_alloc_order [pos++] = i;
34605 /* SSE registers. */
34606 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34607 reg_alloc_order [pos++] = i;
34608 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34609 reg_alloc_order [pos++] = i;
34611 /* x87 registers. */
34612 if (TARGET_SSE_MATH)
34613 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34614 reg_alloc_order [pos++] = i;
34616 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34617 reg_alloc_order [pos++] = i;
34619 /* Initialize the rest of array as we do not allocate some registers
34619 /* Initialize the rest of the array as we do not allocate some registers
34620 at all. */
34621 while (pos < FIRST_PSEUDO_REGISTER)
34622 reg_alloc_order [pos++] = 0;
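/* As an illustrative sketch (assuming -mfpmath=sse on a 64-bit target), the
   order built above comes out roughly as: call-clobbered general registers,
   then call-saved general registers, then xmm0-xmm7 and xmm8-xmm15, then
   st(0)-st(7), then mm0-mm7, with the remaining slots zero-filled.  */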
34625 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34626 in struct attribute_spec.handler. */
34627 static tree
34628 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34629 tree args,
34630 int flags ATTRIBUTE_UNUSED,
34631 bool *no_add_attrs)
34633 if (TREE_CODE (*node) != FUNCTION_TYPE
34634 && TREE_CODE (*node) != METHOD_TYPE
34635 && TREE_CODE (*node) != FIELD_DECL
34636 && TREE_CODE (*node) != TYPE_DECL)
34638 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34639 name);
34640 *no_add_attrs = true;
34641 return NULL_TREE;
34643 if (TARGET_64BIT)
34645 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34646 name);
34647 *no_add_attrs = true;
34648 return NULL_TREE;
34650 if (is_attribute_p ("callee_pop_aggregate_return", name))
34652 tree cst;
34654 cst = TREE_VALUE (args);
34655 if (TREE_CODE (cst) != INTEGER_CST)
34657 warning (OPT_Wattributes,
34658 "%qE attribute requires an integer constant argument",
34659 name);
34660 *no_add_attrs = true;
34662 else if (compare_tree_int (cst, 0) != 0
34663 && compare_tree_int (cst, 1) != 0)
34665 warning (OPT_Wattributes,
34666 "argument to %qE attribute is neither zero, nor one",
34667 name);
34668 *no_add_attrs = true;
34671 return NULL_TREE;
34674 return NULL_TREE;
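/* For illustration (the declaration names are made up), the attribute
   handled above is applied to a function type and takes the integer
   constant 0 or 1, e.g.:

     struct big { int v[8]; };
     struct big make_big (void)
       __attribute__ ((callee_pop_aggregate_return (1)));

   As checked above, it is only accepted on 32-bit targets.  */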
34677 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34678 struct attribute_spec.handler. */
34679 static tree
34680 ix86_handle_abi_attribute (tree *node, tree name,
34681 tree args ATTRIBUTE_UNUSED,
34682 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34684 if (TREE_CODE (*node) != FUNCTION_TYPE
34685 && TREE_CODE (*node) != METHOD_TYPE
34686 && TREE_CODE (*node) != FIELD_DECL
34687 && TREE_CODE (*node) != TYPE_DECL)
34689 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34690 name);
34691 *no_add_attrs = true;
34692 return NULL_TREE;
34695 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
34696 if (is_attribute_p ("ms_abi", name))
34698 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34700 error ("ms_abi and sysv_abi attributes are not compatible");
34703 return NULL_TREE;
34705 else if (is_attribute_p ("sysv_abi", name))
34707 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34709 error ("ms_abi and sysv_abi attributes are not compatible");
34712 return NULL_TREE;
34715 return NULL_TREE;
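/* For illustration (the function names are made up), the attributes
   handled above select the calling convention per function type:

     void f (int, int) __attribute__ ((ms_abi));
     void g (int, int) __attribute__ ((sysv_abi));

   Applying both attributes to the same type is rejected with the error
   above.  */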
34718 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34719 struct attribute_spec.handler. */
34720 static tree
34721 ix86_handle_struct_attribute (tree *node, tree name,
34722 tree args ATTRIBUTE_UNUSED,
34723 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34725 tree *type = NULL;
34726 if (DECL_P (*node))
34728 if (TREE_CODE (*node) == TYPE_DECL)
34729 type = &TREE_TYPE (*node);
34731 else
34732 type = node;
34734 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34736 warning (OPT_Wattributes, "%qE attribute ignored",
34737 name);
34738 *no_add_attrs = true;
34741 else if ((is_attribute_p ("ms_struct", name)
34742 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34743 || ((is_attribute_p ("gcc_struct", name)
34744 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34746 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34747 name);
34748 *no_add_attrs = true;
34751 return NULL_TREE;
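/* For illustration (the type is made up), the attribute handled above
   applies to record and union types:

     struct s { char c; long long x; } __attribute__ ((ms_struct));

   Combining ms_struct with gcc_struct on the same type is ignored with
   the warning above.  */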
34754 static tree
34755 ix86_handle_fndecl_attribute (tree *node, tree name,
34756 tree args ATTRIBUTE_UNUSED,
34757 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34759 if (TREE_CODE (*node) != FUNCTION_DECL)
34761 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34762 name);
34763 *no_add_attrs = true;
34765 return NULL_TREE;
34768 static bool
34769 ix86_ms_bitfield_layout_p (const_tree record_type)
34771 return ((TARGET_MS_BITFIELD_LAYOUT
34772 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34773 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34776 /* Returns an expression indicating where the this parameter is
34777 located on entry to the FUNCTION. */
34779 static rtx
34780 x86_this_parameter (tree function)
34782 tree type = TREE_TYPE (function);
34783 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34784 int nregs;
34786 if (TARGET_64BIT)
34788 const int *parm_regs;
34790 if (ix86_function_type_abi (type) == MS_ABI)
34791 parm_regs = x86_64_ms_abi_int_parameter_registers;
34792 else
34793 parm_regs = x86_64_int_parameter_registers;
34794 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34797 nregs = ix86_function_regparm (type, function);
34799 if (nregs > 0 && !stdarg_p (type))
34801 int regno;
34802 unsigned int ccvt = ix86_get_callcvt (type);
34804 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34805 regno = aggr ? DX_REG : CX_REG;
34806 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34808 regno = CX_REG;
34809 if (aggr)
34810 return gen_rtx_MEM (SImode,
34811 plus_constant (Pmode, stack_pointer_rtx, 4));
34813 else
34815 regno = AX_REG;
34816 if (aggr)
34818 regno = DX_REG;
34819 if (nregs == 1)
34820 return gen_rtx_MEM (SImode,
34821 plus_constant (Pmode,
34822 stack_pointer_rtx, 4));
34825 return gen_rtx_REG (SImode, regno);
34828 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34829 aggr ? 8 : 4));
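/* Illustrative summary of the result above: in 64-bit code THIS arrives in
   the first integer parameter register (%rdi for the SysV ABI, %rcx for the
   MS ABI, shifted to the second register when an aggregate return pointer is
   passed first); in 32-bit regparm, fastcall or thiscall code it is typically
   in %ecx, %edx or %eax; otherwise it is the stack word at 4(%esp), or
   8(%esp) when an aggregate return pointer is passed first.  */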
34832 /* Determine whether x86_output_mi_thunk can succeed. */
34834 static bool
34835 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34836 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34837 HOST_WIDE_INT vcall_offset, const_tree function)
34839 /* 64-bit can handle anything. */
34840 if (TARGET_64BIT)
34841 return true;
34843 /* For 32-bit, everything's fine if we have one free register. */
34844 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34845 return true;
34847 /* Need a free register for vcall_offset. */
34848 if (vcall_offset)
34849 return false;
34851 /* Need a free register for GOT references. */
34852 if (flag_pic && !targetm.binds_local_p (function))
34853 return false;
34855 /* Otherwise ok. */
34856 return true;
34859 /* Output the assembler code for a thunk function. THUNK_DECL is the
34860 declaration for the thunk function itself, FUNCTION is the decl for
34861 the target function. DELTA is an immediate constant offset to be
34862 added to THIS. If VCALL_OFFSET is nonzero, the word at
34863 *(*this + vcall_offset) should be added to THIS. */
34865 static void
34866 x86_output_mi_thunk (FILE *file,
34867 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34868 HOST_WIDE_INT vcall_offset, tree function)
34870 rtx this_param = x86_this_parameter (function);
34871 rtx this_reg, tmp, fnaddr;
34872 unsigned int tmp_regno;
34874 if (TARGET_64BIT)
34875 tmp_regno = R10_REG;
34876 else
34878 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34879 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34880 tmp_regno = AX_REG;
34881 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34882 tmp_regno = DX_REG;
34883 else
34884 tmp_regno = CX_REG;
34887 emit_note (NOTE_INSN_PROLOGUE_END);
34889 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34890 pull it in now and let DELTA benefit. */
34891 if (REG_P (this_param))
34892 this_reg = this_param;
34893 else if (vcall_offset)
34895 /* Put the this parameter into %eax. */
34896 this_reg = gen_rtx_REG (Pmode, AX_REG);
34897 emit_move_insn (this_reg, this_param);
34899 else
34900 this_reg = NULL_RTX;
34902 /* Adjust the this parameter by a fixed constant. */
34903 if (delta)
34905 rtx delta_rtx = GEN_INT (delta);
34906 rtx delta_dst = this_reg ? this_reg : this_param;
34908 if (TARGET_64BIT)
34910 if (!x86_64_general_operand (delta_rtx, Pmode))
34912 tmp = gen_rtx_REG (Pmode, tmp_regno);
34913 emit_move_insn (tmp, delta_rtx);
34914 delta_rtx = tmp;
34918 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34921 /* Adjust the this parameter by a value stored in the vtable. */
34922 if (vcall_offset)
34924 rtx vcall_addr, vcall_mem, this_mem;
34926 tmp = gen_rtx_REG (Pmode, tmp_regno);
34928 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
34929 if (Pmode != ptr_mode)
34930 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34931 emit_move_insn (tmp, this_mem);
34933 /* Adjust the this parameter. */
34934 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34935 if (TARGET_64BIT
34936 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34938 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34939 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34940 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34943 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34944 if (Pmode != ptr_mode)
34945 emit_insn (gen_addsi_1_zext (this_reg,
34946 gen_rtx_REG (ptr_mode,
34947 REGNO (this_reg)),
34948 vcall_mem));
34949 else
34950 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34953 /* If necessary, drop THIS back to its stack slot. */
34954 if (this_reg && this_reg != this_param)
34955 emit_move_insn (this_param, this_reg);
34957 fnaddr = XEXP (DECL_RTL (function), 0);
34958 if (TARGET_64BIT)
34960 if (!flag_pic || targetm.binds_local_p (function)
34961 || cfun->machine->call_abi == MS_ABI)
34963 else
34965 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34966 tmp = gen_rtx_CONST (Pmode, tmp);
34967 fnaddr = gen_rtx_MEM (Pmode, tmp);
34970 else
34972 if (!flag_pic || targetm.binds_local_p (function))
34974 #if TARGET_MACHO
34975 else if (TARGET_MACHO)
34977 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34978 fnaddr = XEXP (fnaddr, 0);
34980 #endif /* TARGET_MACHO */
34981 else
34983 tmp = gen_rtx_REG (Pmode, CX_REG);
34984 output_set_got (tmp, NULL_RTX);
34986 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
34987 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
34988 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
34992 /* Our sibling call patterns do not allow memories, because we have no
34993 predicate that can distinguish between frame and non-frame memory.
34994 For our purposes here, we can get away with (ab)using a jump pattern,
34995 because we're going to do no optimization. */
34996 if (MEM_P (fnaddr))
34997 emit_jump_insn (gen_indirect_jump (fnaddr));
34998 else
35000 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35001 fnaddr = legitimize_pic_address (fnaddr,
35002 gen_rtx_REG (Pmode, tmp_regno));
35004 if (!sibcall_insn_operand (fnaddr, word_mode))
35006 tmp = gen_rtx_REG (word_mode, tmp_regno);
35007 if (GET_MODE (fnaddr) != word_mode)
35008 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35009 emit_move_insn (tmp, fnaddr);
35010 fnaddr = tmp;
35013 tmp = gen_rtx_MEM (QImode, fnaddr);
35014 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35015 tmp = emit_call_insn (tmp);
35016 SIBLING_CALL_P (tmp) = 1;
35018 emit_barrier ();
35020 /* Emit just enough of rest_of_compilation to get the insns emitted.
35021 Note that use_thunk calls assemble_start_function et al. */
35022 tmp = get_insns ();
35023 shorten_branches (tmp);
35024 final_start_function (tmp, file, 1);
35025 final (tmp, file, 1);
35026 final_end_function ();
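/* As a rough illustration (symbol names are made up), a 64-bit thunk with
   DELTA == 16, no VCALL_OFFSET and a locally bound target assembles to
   something like:

	_ZThn16_N1A1fEv:
		addq	$16, %rdi
		jmp	_ZN1A1fEv

   i.e. the THIS adjustment followed by the sibling-call jump emitted by the
   code above (the label itself is emitted by the caller via
   assemble_start_function).  */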
35029 static void
35030 x86_file_start (void)
35032 default_file_start ();
35033 #if TARGET_MACHO
35034 darwin_file_start ();
35035 #endif
35036 if (X86_FILE_START_VERSION_DIRECTIVE)
35037 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35038 if (X86_FILE_START_FLTUSED)
35039 fputs ("\t.global\t__fltused\n", asm_out_file);
35040 if (ix86_asm_dialect == ASM_INTEL)
35041 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35045 x86_field_alignment (tree field, int computed)
35047 enum machine_mode mode;
35048 tree type = TREE_TYPE (field);
35050 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35051 return computed;
35052 mode = TYPE_MODE (strip_array_types (type));
35053 if (mode == DFmode || mode == DCmode
35054 || GET_MODE_CLASS (mode) == MODE_INT
35055 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35056 return MIN (32, computed);
35057 return computed;
35060 /* Output assembler code to FILE to increment profiler label # LABELNO
35061 for profiling a function entry. */
35062 void
35063 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35065 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35066 : MCOUNT_NAME);
35068 if (TARGET_64BIT)
35070 #ifndef NO_PROFILE_COUNTERS
35071 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35072 #endif
35074 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35075 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35076 else
35077 fprintf (file, "\tcall\t%s\n", mcount_name);
35079 else if (flag_pic)
35081 #ifndef NO_PROFILE_COUNTERS
35082 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35083 LPREFIX, labelno);
35084 #endif
35085 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35087 else
35089 #ifndef NO_PROFILE_COUNTERS
35090 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35091 LPREFIX, labelno);
35092 #endif
35093 fprintf (file, "\tcall\t%s\n", mcount_name);
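/* For illustration, with profile counters enabled and 64-bit SysV PIC code
   the sequence emitted above is along the lines of:

	leaq	.LP0(%rip),%r11
	call	*mcount@GOTPCREL(%rip)

   where the counter label and mcount symbol depend on LPREFIX and
   MCOUNT_NAME; 32-bit non-PIC code instead loads the counter label into
   PROFILE_COUNT_REGISTER with a movl and uses a direct call.  */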
35097 /* We don't have exact information about the insn sizes, but we may assume
35098 quite safely that we are informed about all 1-byte insns and memory
35099 address sizes. This is enough to eliminate unnecessary padding in
35100 99% of cases. */
35102 static int
35103 min_insn_size (rtx insn)
35105 int l = 0, len;
35107 if (!INSN_P (insn) || !active_insn_p (insn))
35108 return 0;
35110 /* Discard alignments we've emitted, and jump table data. */
35111 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35112 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35113 return 0;
35114 if (JUMP_TABLE_DATA_P (insn))
35115 return 0;
35117 /* Important case - calls are always 5 bytes.
35118 It is common to have many calls in a row.  */
35119 if (CALL_P (insn)
35120 && symbolic_reference_mentioned_p (PATTERN (insn))
35121 && !SIBLING_CALL_P (insn))
35122 return 5;
35123 len = get_attr_length (insn);
35124 if (len <= 1)
35125 return 1;
35127 /* For normal instructions we rely on get_attr_length being exact,
35128 with a few exceptions. */
35129 if (!JUMP_P (insn))
35131 enum attr_type type = get_attr_type (insn);
35133 switch (type)
35135 case TYPE_MULTI:
35136 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35137 || asm_noperands (PATTERN (insn)) >= 0)
35138 return 0;
35139 break;
35140 case TYPE_OTHER:
35141 case TYPE_FCMP:
35142 break;
35143 default:
35144 /* Otherwise trust get_attr_length. */
35145 return len;
35148 l = get_attr_length_address (insn);
35149 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35150 l = 4;
35152 if (l)
35153 return 1+l;
35154 else
35155 return 2;
35158 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35160 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16-byte
35161 window. */
35163 static void
35164 ix86_avoid_jump_mispredicts (void)
35166 rtx insn, start = get_insns ();
35167 int nbytes = 0, njumps = 0;
35168 int isjump = 0;
35170 /* Look for all minimal intervals of instructions containing 4 jumps.
35171 The intervals are bounded by START and INSN. NBYTES is the total
35172 size of instructions in the interval including INSN and not including
35173 START. When NBYTES is smaller than 16 bytes, it is possible
35174 that the ends of START and INSN fall in the same 16-byte page.
35176 The smallest offset at which INSN can start is when START
35177 ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
35178 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
35180 for (insn = start; insn; insn = NEXT_INSN (insn))
35182 int min_size;
35184 if (LABEL_P (insn))
35186 int align = label_to_alignment (insn);
35187 int max_skip = label_to_max_skip (insn);
35189 if (max_skip > 15)
35190 max_skip = 15;
35191 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35192 already in the current 16 byte page, because otherwise
35193 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35194 bytes to reach 16 byte boundary. */
35195 if (align <= 0
35196 || (align <= 3 && max_skip != (1 << align) - 1))
35197 max_skip = 0;
35198 if (dump_file)
35199 fprintf (dump_file, "Label %i with max_skip %i\n",
35200 INSN_UID (insn), max_skip);
35201 if (max_skip)
35203 while (nbytes + max_skip >= 16)
35205 start = NEXT_INSN (start);
35206 if ((JUMP_P (start)
35207 && GET_CODE (PATTERN (start)) != ADDR_VEC
35208 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35209 || CALL_P (start))
35210 njumps--, isjump = 1;
35211 else
35212 isjump = 0;
35213 nbytes -= min_insn_size (start);
35216 continue;
35219 min_size = min_insn_size (insn);
35220 nbytes += min_size;
35221 if (dump_file)
35222 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35223 INSN_UID (insn), min_size);
35224 if ((JUMP_P (insn)
35225 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35226 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35227 || CALL_P (insn))
35228 njumps++;
35229 else
35230 continue;
35232 while (njumps > 3)
35234 start = NEXT_INSN (start);
35235 if ((JUMP_P (start)
35236 && GET_CODE (PATTERN (start)) != ADDR_VEC
35237 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35238 || CALL_P (start))
35239 njumps--, isjump = 1;
35240 else
35241 isjump = 0;
35242 nbytes -= min_insn_size (start);
35244 gcc_assert (njumps >= 0);
35245 if (dump_file)
35246 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35247 INSN_UID (start), INSN_UID (insn), nbytes);
35249 if (njumps == 3 && isjump && nbytes < 16)
35251 int padsize = 15 - nbytes + min_insn_size (insn);
35253 if (dump_file)
35254 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35255 INSN_UID (insn), padsize);
35256 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
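/* Worked example of the padding decision above: if the current window holds
   three jumps, the instruction just pushed out of the window was itself a
   jump (ISJUMP), and the window is NBYTES == 12 bytes with the incoming jump
   estimated at 2 bytes, then PADSIZE is 15 - 12 + 2 == 5, so a 5-byte pad
   keeps the four jumps from sharing a single 16-byte window.  */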
35260 #endif
35262 /* AMD Athlon works faster
35263 when RET is not the destination of a conditional jump and is not directly
35264 preceded by another jump instruction.  We avoid the penalty by emitting
35265 the longer form of RET in such cases. */
35266 static void
35267 ix86_pad_returns (void)
35269 edge e;
35270 edge_iterator ei;
35272 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35274 basic_block bb = e->src;
35275 rtx ret = BB_END (bb);
35276 rtx prev;
35277 bool replace = false;
35279 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35280 || optimize_bb_for_size_p (bb))
35281 continue;
35282 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35283 if (active_insn_p (prev) || LABEL_P (prev))
35284 break;
35285 if (prev && LABEL_P (prev))
35287 edge e;
35288 edge_iterator ei;
35290 FOR_EACH_EDGE (e, ei, bb->preds)
35291 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35292 && !(e->flags & EDGE_FALLTHRU))
35293 replace = true;
35295 if (!replace)
35297 prev = prev_active_insn (ret);
35298 if (prev
35299 && ((JUMP_P (prev) && any_condjump_p (prev))
35300 || CALL_P (prev)))
35301 replace = true;
35302 /* Empty functions get branch mispredict even when
35303 the jump destination is not visible to us. */
35304 if (!prev && !optimize_function_for_size_p (cfun))
35305 replace = true;
35307 if (replace)
35309 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35310 delete_insn (ret);
35315 /* Count the minimum number of instructions in BB. Return 4 if the
35316 number of instructions >= 4. */
35318 static int
35319 ix86_count_insn_bb (basic_block bb)
35321 rtx insn;
35322 int insn_count = 0;
35324 /* Count number of instructions in this block. Return 4 if the number
35325 of instructions >= 4. */
35326 FOR_BB_INSNS (bb, insn)
35328 /* This only happens in exit blocks. */
35329 if (JUMP_P (insn)
35330 && ANY_RETURN_P (PATTERN (insn)))
35331 break;
35333 if (NONDEBUG_INSN_P (insn)
35334 && GET_CODE (PATTERN (insn)) != USE
35335 && GET_CODE (PATTERN (insn)) != CLOBBER)
35337 insn_count++;
35338 if (insn_count >= 4)
35339 return insn_count;
35343 return insn_count;
35347 /* Count the minimum number of instructions in code path in BB.
35348 Return 4 if the number of instructions >= 4. */
35350 static int
35351 ix86_count_insn (basic_block bb)
35353 edge e;
35354 edge_iterator ei;
35355 int min_prev_count;
35357 /* Only bother counting instructions along paths with no
35358 more than 2 basic blocks between entry and exit. Given
35359 that BB has an edge to exit, determine if a predecessor
35360 of BB has an edge from entry. If so, compute the number
35361 of instructions in the predecessor block. If there
35362 happen to be multiple such blocks, compute the minimum. */
35363 min_prev_count = 4;
35364 FOR_EACH_EDGE (e, ei, bb->preds)
35366 edge prev_e;
35367 edge_iterator prev_ei;
35369 if (e->src == ENTRY_BLOCK_PTR)
35371 min_prev_count = 0;
35372 break;
35374 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35376 if (prev_e->src == ENTRY_BLOCK_PTR)
35378 int count = ix86_count_insn_bb (e->src);
35379 if (count < min_prev_count)
35380 min_prev_count = count;
35381 break;
35386 if (min_prev_count < 4)
35387 min_prev_count += ix86_count_insn_bb (bb);
35389 return min_prev_count;
35392 /* Pad short functions to 4 instructions. */
35394 static void
35395 ix86_pad_short_function (void)
35397 edge e;
35398 edge_iterator ei;
35400 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35402 rtx ret = BB_END (e->src);
35403 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35405 int insn_count = ix86_count_insn (e->src);
35407 /* Pad short function. */
35408 if (insn_count < 4)
35410 rtx insn = ret;
35412 /* Find epilogue. */
35413 while (insn
35414 && (!NOTE_P (insn)
35415 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35416 insn = PREV_INSN (insn);
35418 if (!insn)
35419 insn = ret;
35421 /* Two NOPs count as one instruction. */
35422 insn_count = 2 * (4 - insn_count);
35423 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
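/* Worked example of the padding above: a function whose shortest path to
   exit contains a single real instruction gets INSN_COUNT 2 * (4 - 1) == 6,
   i.e. six NOPs (counted as three instructions) emitted just before the
   epilogue note.  */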
35429 /* Implement machine specific optimizations. We implement padding of returns
35430 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
35431 static void
35432 ix86_reorg (void)
35434 /* We are freeing block_for_insn in the toplev to keep compatibility
35435 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35436 compute_bb_for_insn ();
35438 if (optimize && optimize_function_for_speed_p (cfun))
35440 if (TARGET_PAD_SHORT_FUNCTION)
35441 ix86_pad_short_function ();
35442 else if (TARGET_PAD_RETURNS)
35443 ix86_pad_returns ();
35444 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35445 if (TARGET_FOUR_JUMP_LIMIT)
35446 ix86_avoid_jump_mispredicts ();
35447 #endif
35451 /* Return nonzero when a QImode register that must be represented via a REX prefix
35452 is used. */
35453 bool
35454 x86_extended_QIreg_mentioned_p (rtx insn)
35456 int i;
35457 extract_insn_cached (insn);
35458 for (i = 0; i < recog_data.n_operands; i++)
35459 if (GENERAL_REG_P (recog_data.operand[i])
35460 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35461 return true;
35462 return false;
35465 /* Return nonzero when P points to a register encoded via a REX prefix.
35466 Called via for_each_rtx. */
35467 static int
35468 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35470 unsigned int regno;
35471 if (!REG_P (*p))
35472 return 0;
35473 regno = REGNO (*p);
35474 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35477 /* Return true when INSN mentions a register that must be encoded using a REX
35478 prefix. */
35479 bool
35480 x86_extended_reg_mentioned_p (rtx insn)
35482 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35483 extended_reg_mentioned_1, NULL);
35486 /* If profitable, negate (without causing overflow) integer constant
35487 of mode MODE at location LOC. Return true in this case. */
35488 bool
35489 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35491 HOST_WIDE_INT val;
35493 if (!CONST_INT_P (*loc))
35494 return false;
35496 switch (mode)
35498 case DImode:
35499 /* DImode x86_64 constants must fit in 32 bits. */
35500 gcc_assert (x86_64_immediate_operand (*loc, mode));
35502 mode = SImode;
35503 break;
35505 case SImode:
35506 case HImode:
35507 case QImode:
35508 break;
35510 default:
35511 gcc_unreachable ();
35514 /* Avoid overflows. */
35515 if (mode_signbit_p (mode, *loc))
35516 return false;
35518 val = INTVAL (*loc);
35520 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35521 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
35522 if ((val < 0 && val != -128)
35523 || val == 128)
35525 *loc = GEN_INT (-val);
35526 return true;
35529 return false;
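/* Worked example of the negation above: the constant in, e.g.,
   "addl $-4,%eax" is negated to 4 so the insn can be output as
   "subl $4,%eax"; likewise a constant of exactly 128 is negated to -128,
   which still fits in a sign-extended 8-bit immediate.  */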
35532 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35533 optabs would emit if we didn't have TFmode patterns. */
35535 void
35536 x86_emit_floatuns (rtx operands[2])
35538 rtx neglab, donelab, i0, i1, f0, in, out;
35539 enum machine_mode mode, inmode;
35541 inmode = GET_MODE (operands[1]);
35542 gcc_assert (inmode == SImode || inmode == DImode);
35544 out = operands[0];
35545 in = force_reg (inmode, operands[1]);
35546 mode = GET_MODE (out);
35547 neglab = gen_label_rtx ();
35548 donelab = gen_label_rtx ();
35549 f0 = gen_reg_rtx (mode);
35551 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35553 expand_float (out, in, 0);
35555 emit_jump_insn (gen_jump (donelab));
35556 emit_barrier ();
35558 emit_label (neglab);
35560 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35561 1, OPTAB_DIRECT);
35562 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35563 1, OPTAB_DIRECT);
35564 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35566 expand_float (f0, i0, 0);
35568 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35570 emit_label (donelab);
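/* A minimal C sketch of the sequence emitted above, assuming a 64-bit
   unsigned input converted to double: values that are non-negative when
   viewed as signed convert directly; negative ones are halved with the low
   bit folded back in, converted, then doubled:

     double
     u64_to_double (unsigned long long x)
     {
       if ((long long) x >= 0)
	 return (double) (long long) x;
       unsigned long long half = (x >> 1) | (x & 1);
       return 2.0 * (double) (long long) half;
     }
*/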
35573 /* AVX2 supports 32-byte integer vector operations,
35574 thus the longest vector we are faced with is V32QImode. */
35575 #define MAX_VECT_LEN 32
35577 struct expand_vec_perm_d
35579 rtx target, op0, op1;
35580 unsigned char perm[MAX_VECT_LEN];
35581 enum machine_mode vmode;
35582 unsigned char nelt;
35583 bool one_operand_p;
35584 bool testing_p;
35587 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35588 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35589 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35591 /* Get a vector mode of the same size as the original but with elements
35592 twice as wide. This is only guaranteed to apply to integral vectors. */
35594 static inline enum machine_mode
35595 get_mode_wider_vector (enum machine_mode o)
35597 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35598 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35599 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35600 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35601 return n;
35604 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35605 with all elements equal to VAR. Return true if successful. */
35607 static bool
35608 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35609 rtx target, rtx val)
35611 bool ok;
35613 switch (mode)
35615 case V2SImode:
35616 case V2SFmode:
35617 if (!mmx_ok)
35618 return false;
35619 /* FALLTHRU */
35621 case V4DFmode:
35622 case V4DImode:
35623 case V8SFmode:
35624 case V8SImode:
35625 case V2DFmode:
35626 case V2DImode:
35627 case V4SFmode:
35628 case V4SImode:
35630 rtx insn, dup;
35632 /* First attempt to recognize VAL as-is. */
35633 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35634 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35635 if (recog_memoized (insn) < 0)
35637 rtx seq;
35638 /* If that fails, force VAL into a register. */
35640 start_sequence ();
35641 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35642 seq = get_insns ();
35643 end_sequence ();
35644 if (seq)
35645 emit_insn_before (seq, insn);
35647 ok = recog_memoized (insn) >= 0;
35648 gcc_assert (ok);
35651 return true;
35653 case V4HImode:
35654 if (!mmx_ok)
35655 return false;
35656 if (TARGET_SSE || TARGET_3DNOW_A)
35658 rtx x;
35660 val = gen_lowpart (SImode, val);
35661 x = gen_rtx_TRUNCATE (HImode, val);
35662 x = gen_rtx_VEC_DUPLICATE (mode, x);
35663 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35664 return true;
35666 goto widen;
35668 case V8QImode:
35669 if (!mmx_ok)
35670 return false;
35671 goto widen;
35673 case V8HImode:
35674 if (TARGET_SSE2)
35676 struct expand_vec_perm_d dperm;
35677 rtx tmp1, tmp2;
35679 permute:
35680 memset (&dperm, 0, sizeof (dperm));
35681 dperm.target = target;
35682 dperm.vmode = mode;
35683 dperm.nelt = GET_MODE_NUNITS (mode);
35684 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35685 dperm.one_operand_p = true;
35687 /* Extend to SImode using a paradoxical SUBREG. */
35688 tmp1 = gen_reg_rtx (SImode);
35689 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35691 /* Insert the SImode value as low element of a V4SImode vector. */
35692 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35693 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35695 ok = (expand_vec_perm_1 (&dperm)
35696 || expand_vec_perm_broadcast_1 (&dperm));
35697 gcc_assert (ok);
35698 return ok;
35700 goto widen;
35702 case V16QImode:
35703 if (TARGET_SSE2)
35704 goto permute;
35705 goto widen;
35707 widen:
35708 /* Replicate the value once into the next wider mode and recurse. */
35710 enum machine_mode smode, wsmode, wvmode;
35711 rtx x;
35713 smode = GET_MODE_INNER (mode);
35714 wvmode = get_mode_wider_vector (mode);
35715 wsmode = GET_MODE_INNER (wvmode);
35717 val = convert_modes (wsmode, smode, val, true);
35718 x = expand_simple_binop (wsmode, ASHIFT, val,
35719 GEN_INT (GET_MODE_BITSIZE (smode)),
35720 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35721 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35723 x = gen_lowpart (wvmode, target);
35724 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35725 gcc_assert (ok);
35726 return ok;
35729 case V16HImode:
35730 case V32QImode:
35732 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35733 rtx x = gen_reg_rtx (hvmode);
35735 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35736 gcc_assert (ok);
35738 x = gen_rtx_VEC_CONCAT (mode, x, x);
35739 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35741 return true;
35743 default:
35744 return false;
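/* Worked example of the "widen" path above: broadcasting a QImode value such
   as 0xAB into V8QImode first forms the HImode value
   (0xAB << 8) | 0xAB == 0xABAB and recurses on V4HImode, eventually reaching
   a mode that the duplicate or permute patterns handle directly.  */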
35748 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35749 whose ONE_VAR element is VAR, and other elements are zero. Return true
35750 if successful. */
35752 static bool
35753 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35754 rtx target, rtx var, int one_var)
35756 enum machine_mode vsimode;
35757 rtx new_target;
35758 rtx x, tmp;
35759 bool use_vector_set = false;
35761 switch (mode)
35763 case V2DImode:
35764 /* For SSE4.1, we normally use vector set. But if the second
35765 element is zero and inter-unit moves are OK, we use movq
35766 instead. */
35767 use_vector_set = (TARGET_64BIT
35768 && TARGET_SSE4_1
35769 && !(TARGET_INTER_UNIT_MOVES
35770 && one_var == 0));
35771 break;
35772 case V16QImode:
35773 case V4SImode:
35774 case V4SFmode:
35775 use_vector_set = TARGET_SSE4_1;
35776 break;
35777 case V8HImode:
35778 use_vector_set = TARGET_SSE2;
35779 break;
35780 case V4HImode:
35781 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35782 break;
35783 case V32QImode:
35784 case V16HImode:
35785 case V8SImode:
35786 case V8SFmode:
35787 case V4DFmode:
35788 use_vector_set = TARGET_AVX;
35789 break;
35790 case V4DImode:
35791 /* Use ix86_expand_vector_set in 64bit mode only. */
35792 use_vector_set = TARGET_AVX && TARGET_64BIT;
35793 break;
35794 default:
35795 break;
35798 if (use_vector_set)
35800 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35801 var = force_reg (GET_MODE_INNER (mode), var);
35802 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35803 return true;
35806 switch (mode)
35808 case V2SFmode:
35809 case V2SImode:
35810 if (!mmx_ok)
35811 return false;
35812 /* FALLTHRU */
35814 case V2DFmode:
35815 case V2DImode:
35816 if (one_var != 0)
35817 return false;
35818 var = force_reg (GET_MODE_INNER (mode), var);
35819 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35820 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35821 return true;
35823 case V4SFmode:
35824 case V4SImode:
35825 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35826 new_target = gen_reg_rtx (mode);
35827 else
35828 new_target = target;
35829 var = force_reg (GET_MODE_INNER (mode), var);
35830 x = gen_rtx_VEC_DUPLICATE (mode, var);
35831 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35832 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35833 if (one_var != 0)
35835 /* We need to shuffle the value to the correct position, so
35836 create a new pseudo to store the intermediate result. */
35838 /* With SSE2, we can use the integer shuffle insns. */
35839 if (mode != V4SFmode && TARGET_SSE2)
35841 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35842 const1_rtx,
35843 GEN_INT (one_var == 1 ? 0 : 1),
35844 GEN_INT (one_var == 2 ? 0 : 1),
35845 GEN_INT (one_var == 3 ? 0 : 1)));
35846 if (target != new_target)
35847 emit_move_insn (target, new_target);
35848 return true;
35851 /* Otherwise convert the intermediate result to V4SFmode and
35852 use the SSE1 shuffle instructions. */
35853 if (mode != V4SFmode)
35855 tmp = gen_reg_rtx (V4SFmode);
35856 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35858 else
35859 tmp = new_target;
35861 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35862 const1_rtx,
35863 GEN_INT (one_var == 1 ? 0 : 1),
35864 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35865 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35867 if (mode != V4SFmode)
35868 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35869 else if (tmp != target)
35870 emit_move_insn (target, tmp);
35872 else if (target != new_target)
35873 emit_move_insn (target, new_target);
35874 return true;
35876 case V8HImode:
35877 case V16QImode:
35878 vsimode = V4SImode;
35879 goto widen;
35880 case V4HImode:
35881 case V8QImode:
35882 if (!mmx_ok)
35883 return false;
35884 vsimode = V2SImode;
35885 goto widen;
35886 widen:
35887 if (one_var != 0)
35888 return false;
35890 /* Zero extend the variable element to SImode and recurse. */
35891 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35893 x = gen_reg_rtx (vsimode);
35894 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35895 var, one_var))
35896 gcc_unreachable ();
35898 emit_move_insn (target, gen_lowpart (mode, x));
35899 return true;
35901 default:
35902 return false;
35906 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35907 consisting of the values in VALS. It is known that all elements
35908 except ONE_VAR are constants. Return true if successful. */
35910 static bool
35911 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35912 rtx target, rtx vals, int one_var)
35914 rtx var = XVECEXP (vals, 0, one_var);
35915 enum machine_mode wmode;
35916 rtx const_vec, x;
35918 const_vec = copy_rtx (vals);
35919 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35920 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35922 switch (mode)
35924 case V2DFmode:
35925 case V2DImode:
35926 case V2SFmode:
35927 case V2SImode:
35928 /* For the two element vectors, it's just as easy to use
35929 the general case. */
35930 return false;
35932 case V4DImode:
35933 /* Use ix86_expand_vector_set in 64bit mode only. */
35934 if (!TARGET_64BIT)
35935 return false;
35936 case V4DFmode:
35937 case V8SFmode:
35938 case V8SImode:
35939 case V16HImode:
35940 case V32QImode:
35941 case V4SFmode:
35942 case V4SImode:
35943 case V8HImode:
35944 case V4HImode:
35945 break;
35947 case V16QImode:
35948 if (TARGET_SSE4_1)
35949 break;
35950 wmode = V8HImode;
35951 goto widen;
35952 case V8QImode:
35953 wmode = V4HImode;
35954 goto widen;
35955 widen:
35956 /* There's no way to set one QImode entry easily. Combine
35957 the variable value with its adjacent constant value, and
35958 promote to an HImode set. */
35959 x = XVECEXP (vals, 0, one_var ^ 1);
35960 if (one_var & 1)
35962 var = convert_modes (HImode, QImode, var, true);
35963 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35964 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35965 x = GEN_INT (INTVAL (x) & 0xff);
35967 else
35969 var = convert_modes (HImode, QImode, var, true);
35970 x = gen_int_mode (INTVAL (x) << 8, HImode);
35972 if (x != const0_rtx)
35973 var = expand_simple_binop (HImode, IOR, var, x, var,
35974 1, OPTAB_LIB_WIDEN);
35976 x = gen_reg_rtx (wmode);
35977 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35978 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35980 emit_move_insn (target, gen_lowpart (mode, x));
35981 return true;
35983 default:
35984 return false;
35987 emit_move_insn (target, const_vec);
35988 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35989 return true;
35992 /* A subroutine of ix86_expand_vector_init_general. Use vector
35993 concatenate to handle the most general case: all values variable,
35994 and none identical. */
35996 static void
35997 ix86_expand_vector_init_concat (enum machine_mode mode,
35998 rtx target, rtx *ops, int n)
36000 enum machine_mode cmode, hmode = VOIDmode;
36001 rtx first[8], second[4];
36002 rtvec v;
36003 int i, j;
36005 switch (n)
36007 case 2:
36008 switch (mode)
36010 case V8SImode:
36011 cmode = V4SImode;
36012 break;
36013 case V8SFmode:
36014 cmode = V4SFmode;
36015 break;
36016 case V4DImode:
36017 cmode = V2DImode;
36018 break;
36019 case V4DFmode:
36020 cmode = V2DFmode;
36021 break;
36022 case V4SImode:
36023 cmode = V2SImode;
36024 break;
36025 case V4SFmode:
36026 cmode = V2SFmode;
36027 break;
36028 case V2DImode:
36029 cmode = DImode;
36030 break;
36031 case V2SImode:
36032 cmode = SImode;
36033 break;
36034 case V2DFmode:
36035 cmode = DFmode;
36036 break;
36037 case V2SFmode:
36038 cmode = SFmode;
36039 break;
36040 default:
36041 gcc_unreachable ();
36044 if (!register_operand (ops[1], cmode))
36045 ops[1] = force_reg (cmode, ops[1]);
36046 if (!register_operand (ops[0], cmode))
36047 ops[0] = force_reg (cmode, ops[0]);
36048 emit_insn (gen_rtx_SET (VOIDmode, target,
36049 gen_rtx_VEC_CONCAT (mode, ops[0],
36050 ops[1])));
36051 break;
36053 case 4:
36054 switch (mode)
36056 case V4DImode:
36057 cmode = V2DImode;
36058 break;
36059 case V4DFmode:
36060 cmode = V2DFmode;
36061 break;
36062 case V4SImode:
36063 cmode = V2SImode;
36064 break;
36065 case V4SFmode:
36066 cmode = V2SFmode;
36067 break;
36068 default:
36069 gcc_unreachable ();
36071 goto half;
36073 case 8:
36074 switch (mode)
36076 case V8SImode:
36077 cmode = V2SImode;
36078 hmode = V4SImode;
36079 break;
36080 case V8SFmode:
36081 cmode = V2SFmode;
36082 hmode = V4SFmode;
36083 break;
36084 default:
36085 gcc_unreachable ();
36087 goto half;
36089 half:
36090 /* FIXME: We process inputs backward to help RA. PR 36222. */
36091 i = n - 1;
36092 j = (n >> 1) - 1;
36093 for (; i > 0; i -= 2, j--)
36095 first[j] = gen_reg_rtx (cmode);
36096 v = gen_rtvec (2, ops[i - 1], ops[i]);
36097 ix86_expand_vector_init (false, first[j],
36098 gen_rtx_PARALLEL (cmode, v));
36101 n >>= 1;
36102 if (n > 2)
36104 gcc_assert (hmode != VOIDmode);
36105 for (i = j = 0; i < n; i += 2, j++)
36107 second[j] = gen_reg_rtx (hmode);
36108 ix86_expand_vector_init_concat (hmode, second [j],
36109 &first [i], 2);
36111 n >>= 1;
36112 ix86_expand_vector_init_concat (mode, target, second, n);
36114 else
36115 ix86_expand_vector_init_concat (mode, target, first, n);
36116 break;
36118 default:
36119 gcc_unreachable ();
36123 /* A subroutine of ix86_expand_vector_init_general. Use vector
36124 interleave to handle the most general case: all values variable,
36125 and none identical. */
36127 static void
36128 ix86_expand_vector_init_interleave (enum machine_mode mode,
36129 rtx target, rtx *ops, int n)
36131 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36132 int i, j;
36133 rtx op0, op1;
36134 rtx (*gen_load_even) (rtx, rtx, rtx);
36135 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36136 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36138 switch (mode)
36140 case V8HImode:
36141 gen_load_even = gen_vec_setv8hi;
36142 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36143 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36144 inner_mode = HImode;
36145 first_imode = V4SImode;
36146 second_imode = V2DImode;
36147 third_imode = VOIDmode;
36148 break;
36149 case V16QImode:
36150 gen_load_even = gen_vec_setv16qi;
36151 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36152 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36153 inner_mode = QImode;
36154 first_imode = V8HImode;
36155 second_imode = V4SImode;
36156 third_imode = V2DImode;
36157 break;
36158 default:
36159 gcc_unreachable ();
36162 for (i = 0; i < n; i++)
36164 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36165 op0 = gen_reg_rtx (SImode);
36166 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36168 /* Insert the SImode value as low element of V4SImode vector. */
36169 op1 = gen_reg_rtx (V4SImode);
36170 op0 = gen_rtx_VEC_MERGE (V4SImode,
36171 gen_rtx_VEC_DUPLICATE (V4SImode,
36172 op0),
36173 CONST0_RTX (V4SImode),
36174 const1_rtx);
36175 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36177 /* Cast the V4SImode vector back to a vector in the original mode. */
36178 op0 = gen_reg_rtx (mode);
36179 emit_move_insn (op0, gen_lowpart (mode, op1));
36181 /* Load even elements into the second position. */
36182 emit_insn (gen_load_even (op0,
36183 force_reg (inner_mode,
36184 ops [i + i + 1]),
36185 const1_rtx));
36187 /* Cast vector to FIRST_IMODE vector. */
36188 ops[i] = gen_reg_rtx (first_imode);
36189 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36192 /* Interleave low FIRST_IMODE vectors. */
36193 for (i = j = 0; i < n; i += 2, j++)
36195 op0 = gen_reg_rtx (first_imode);
36196 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36198 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36199 ops[j] = gen_reg_rtx (second_imode);
36200 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36203 /* Interleave low SECOND_IMODE vectors. */
36204 switch (second_imode)
36206 case V4SImode:
36207 for (i = j = 0; i < n / 2; i += 2, j++)
36209 op0 = gen_reg_rtx (second_imode);
36210 emit_insn (gen_interleave_second_low (op0, ops[i],
36211 ops[i + 1]));
36213 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36214 vector. */
36215 ops[j] = gen_reg_rtx (third_imode);
36216 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36218 second_imode = V2DImode;
36219 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36220 /* FALLTHRU */
36222 case V2DImode:
36223 op0 = gen_reg_rtx (second_imode);
36224 emit_insn (gen_interleave_second_low (op0, ops[0],
36225 ops[1]));
36227 /* Cast the SECOND_IMODE vector back to a vector in the original
36228 mode. */
36229 emit_insn (gen_rtx_SET (VOIDmode, target,
36230 gen_lowpart (mode, op0)));
36231 break;
36233 default:
36234 gcc_unreachable ();
36238 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36239 all values variable, and none identical. */
36241 static void
36242 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36243 rtx target, rtx vals)
36245 rtx ops[32], op0, op1;
36246 enum machine_mode half_mode = VOIDmode;
36247 int n, i;
36249 switch (mode)
36251 case V2SFmode:
36252 case V2SImode:
36253 if (!mmx_ok && !TARGET_SSE)
36254 break;
36255 /* FALLTHRU */
36257 case V8SFmode:
36258 case V8SImode:
36259 case V4DFmode:
36260 case V4DImode:
36261 case V4SFmode:
36262 case V4SImode:
36263 case V2DFmode:
36264 case V2DImode:
36265 n = GET_MODE_NUNITS (mode);
36266 for (i = 0; i < n; i++)
36267 ops[i] = XVECEXP (vals, 0, i);
36268 ix86_expand_vector_init_concat (mode, target, ops, n);
36269 return;
36271 case V32QImode:
36272 half_mode = V16QImode;
36273 goto half;
36275 case V16HImode:
36276 half_mode = V8HImode;
36277 goto half;
36279 half:
36280 n = GET_MODE_NUNITS (mode);
36281 for (i = 0; i < n; i++)
36282 ops[i] = XVECEXP (vals, 0, i);
36283 op0 = gen_reg_rtx (half_mode);
36284 op1 = gen_reg_rtx (half_mode);
36285 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36286 n >> 2);
36287 ix86_expand_vector_init_interleave (half_mode, op1,
36288 &ops [n >> 1], n >> 2);
36289 emit_insn (gen_rtx_SET (VOIDmode, target,
36290 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36291 return;
36293 case V16QImode:
36294 if (!TARGET_SSE4_1)
36295 break;
36296 /* FALLTHRU */
36298 case V8HImode:
36299 if (!TARGET_SSE2)
36300 break;
36302 /* Don't use ix86_expand_vector_init_interleave if we can't
36303 move from GPR to SSE register directly. */
36304 if (!TARGET_INTER_UNIT_MOVES)
36305 break;
36307 n = GET_MODE_NUNITS (mode);
36308 for (i = 0; i < n; i++)
36309 ops[i] = XVECEXP (vals, 0, i);
36310 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36311 return;
36313 case V4HImode:
36314 case V8QImode:
36315 break;
36317 default:
36318 gcc_unreachable ();
36322 int i, j, n_elts, n_words, n_elt_per_word;
36323 enum machine_mode inner_mode;
36324 rtx words[4], shift;
36326 inner_mode = GET_MODE_INNER (mode);
36327 n_elts = GET_MODE_NUNITS (mode);
36328 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36329 n_elt_per_word = n_elts / n_words;
36330 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36332 for (i = 0; i < n_words; ++i)
36334 rtx word = NULL_RTX;
36336 for (j = 0; j < n_elt_per_word; ++j)
36338 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36339 elt = convert_modes (word_mode, inner_mode, elt, true);
36341 if (j == 0)
36342 word = elt;
36343 else
36345 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36346 word, 1, OPTAB_LIB_WIDEN);
36347 word = expand_simple_binop (word_mode, IOR, word, elt,
36348 word, 1, OPTAB_LIB_WIDEN);
36352 words[i] = word;
36355 if (n_words == 1)
36356 emit_move_insn (target, gen_lowpart (mode, words[0]));
36357 else if (n_words == 2)
36359 rtx tmp = gen_reg_rtx (mode);
36360 emit_clobber (tmp);
36361 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36362 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36363 emit_move_insn (target, tmp);
36365 else if (n_words == 4)
36367 rtx tmp = gen_reg_rtx (V4SImode);
36368 gcc_assert (word_mode == SImode);
36369 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36370 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36371 emit_move_insn (target, gen_lowpart (mode, tmp));
36373 else
36374 gcc_unreachable ();
36378 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36379 instructions unless MMX_OK is true. */
36381 void
36382 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36384 enum machine_mode mode = GET_MODE (target);
36385 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36386 int n_elts = GET_MODE_NUNITS (mode);
36387 int n_var = 0, one_var = -1;
36388 bool all_same = true, all_const_zero = true;
36389 int i;
36390 rtx x;
36392 for (i = 0; i < n_elts; ++i)
36394 x = XVECEXP (vals, 0, i);
36395 if (!(CONST_INT_P (x)
36396 || GET_CODE (x) == CONST_DOUBLE
36397 || GET_CODE (x) == CONST_FIXED))
36398 n_var++, one_var = i;
36399 else if (x != CONST0_RTX (inner_mode))
36400 all_const_zero = false;
36401 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36402 all_same = false;
36405 /* Constants are best loaded from the constant pool. */
36406 if (n_var == 0)
36408 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36409 return;
36412 /* If all values are identical, broadcast the value. */
36413 if (all_same
36414 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36415 XVECEXP (vals, 0, 0)))
36416 return;
36418 /* Values where only one field is non-constant are best loaded from
36419 the pool and overwritten via move later. */
36420 if (n_var == 1)
36422 if (all_const_zero
36423 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36424 XVECEXP (vals, 0, one_var),
36425 one_var))
36426 return;
36428 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36429 return;
36432 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36435 void
36436 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36438 enum machine_mode mode = GET_MODE (target);
36439 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36440 enum machine_mode half_mode;
36441 bool use_vec_merge = false;
36442 rtx tmp;
36443 static rtx (*gen_extract[6][2]) (rtx, rtx)
36445 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36446 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36447 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36448 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36449 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36450 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36452 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36454 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36455 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36456 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36457 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36458 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36459 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36461 int i, j, n;
36463 switch (mode)
36465 case V2SFmode:
36466 case V2SImode:
36467 if (mmx_ok)
36469 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36470 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36471 if (elt == 0)
36472 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36473 else
36474 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36475 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36476 return;
36478 break;
36480 case V2DImode:
36481 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36482 if (use_vec_merge)
36483 break;
36485 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36486 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36487 if (elt == 0)
36488 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36489 else
36490 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36491 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36492 return;
36494 case V2DFmode:
36496 rtx op0, op1;
36498 /* For the two element vectors, we implement a VEC_CONCAT with
36499 the extraction of the other element. */
36501 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36502 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36504 if (elt == 0)
36505 op0 = val, op1 = tmp;
36506 else
36507 op0 = tmp, op1 = val;
36509 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36510 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36512 return;
36514 case V4SFmode:
36515 use_vec_merge = TARGET_SSE4_1;
36516 if (use_vec_merge)
36517 break;
36519 switch (elt)
36521 case 0:
36522 use_vec_merge = true;
36523 break;
36525 case 1:
36526 /* tmp = target = A B C D */
36527 tmp = copy_to_reg (target);
36528 /* target = A A B B */
36529 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36530 /* target = X A B B */
36531 ix86_expand_vector_set (false, target, val, 0);
36532 /* target = A X C D */
36533 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36534 const1_rtx, const0_rtx,
36535 GEN_INT (2+4), GEN_INT (3+4)));
36536 return;
36538 case 2:
36539 /* tmp = target = A B C D */
36540 tmp = copy_to_reg (target);
36541 /* tmp = X B C D */
36542 ix86_expand_vector_set (false, tmp, val, 0);
36543 /* target = A B X D */
36544 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36545 const0_rtx, const1_rtx,
36546 GEN_INT (0+4), GEN_INT (3+4)));
36547 return;
36549 case 3:
36550 /* tmp = target = A B C D */
36551 tmp = copy_to_reg (target);
36552 /* tmp = X B C D */
36553 ix86_expand_vector_set (false, tmp, val, 0);
36554 /* target = A B X D */
36555 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36556 const0_rtx, const1_rtx,
36557 GEN_INT (2+4), GEN_INT (0+4)));
36558 return;
36560 default:
36561 gcc_unreachable ();
36563 break;
36565 case V4SImode:
36566 use_vec_merge = TARGET_SSE4_1;
36567 if (use_vec_merge)
36568 break;
36570 /* Element 0 handled by vec_merge below. */
36571 if (elt == 0)
36573 use_vec_merge = true;
36574 break;
36577 if (TARGET_SSE2)
36579 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36580 store into element 0, then shuffle them back. */
36582 rtx order[4];
36584 order[0] = GEN_INT (elt);
36585 order[1] = const1_rtx;
36586 order[2] = const2_rtx;
36587 order[3] = GEN_INT (3);
36588 order[elt] = const0_rtx;
36590 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36591 order[1], order[2], order[3]));
36593 ix86_expand_vector_set (false, target, val, 0);
36595 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36596 order[1], order[2], order[3]));
36598 else
36600 /* For SSE1, we have to reuse the V4SF code. */
36601 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36602 gen_lowpart (SFmode, val), elt);
36604 return;
36606 case V8HImode:
36607 use_vec_merge = TARGET_SSE2;
36608 break;
36609 case V4HImode:
36610 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36611 break;
36613 case V16QImode:
36614 use_vec_merge = TARGET_SSE4_1;
36615 break;
36617 case V8QImode:
36618 break;
36620 case V32QImode:
36621 half_mode = V16QImode;
36622 j = 0;
36623 n = 16;
36624 goto half;
36626 case V16HImode:
36627 half_mode = V8HImode;
36628 j = 1;
36629 n = 8;
36630 goto half;
36632 case V8SImode:
36633 half_mode = V4SImode;
36634 j = 2;
36635 n = 4;
36636 goto half;
36638 case V4DImode:
36639 half_mode = V2DImode;
36640 j = 3;
36641 n = 2;
36642 goto half;
36644 case V8SFmode:
36645 half_mode = V4SFmode;
36646 j = 4;
36647 n = 4;
36648 goto half;
36650 case V4DFmode:
36651 half_mode = V2DFmode;
36652 j = 5;
36653 n = 2;
36654 goto half;
36656 half:
36657 /* Compute offset. */
36658 i = elt / n;
36659 elt %= n;
36661 gcc_assert (i <= 1);
36663 /* Extract the half. */
36664 tmp = gen_reg_rtx (half_mode);
36665 emit_insn (gen_extract[j][i] (tmp, target));
36667 /* Put val in tmp at elt. */
36668 ix86_expand_vector_set (false, tmp, val, elt);
36670 /* Put it back. */
36671 emit_insn (gen_insert[j][i] (target, target, tmp));
36672 return;
36674 default:
36675 break;
36678 if (use_vec_merge)
36680 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36681 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36682 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36684 else
36686 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36688 emit_move_insn (mem, target);
36690 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36691 emit_move_insn (tmp, val);
36693 emit_move_insn (target, mem);
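/* Illustrative sketch, not part of this file: the stack-temp fallback just
   above behaves like spilling the vector to memory, overwriting one lane at
   its byte offset, and reloading the whole vector.  A plain C model for a
   4 x float vector; the name set_lane4 is hypothetical.  */
static void
set_lane4 (float *vec /* 4 elements */, float val, int elt)
{
  float buf[4];
  __builtin_memcpy (buf, vec, sizeof buf);  /* emit_move_insn (mem, target) */
  buf[elt] = val;                           /* emit_move_insn (tmp, val)    */
  __builtin_memcpy (vec, buf, sizeof buf);  /* emit_move_insn (target, mem) */
}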
36697 void
36698 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36700 enum machine_mode mode = GET_MODE (vec);
36701 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36702 bool use_vec_extr = false;
36703 rtx tmp;
36705 switch (mode)
36707 case V2SImode:
36708 case V2SFmode:
36709 if (!mmx_ok)
36710 break;
36711 /* FALLTHRU */
36713 case V2DFmode:
36714 case V2DImode:
36715 use_vec_extr = true;
36716 break;
36718 case V4SFmode:
36719 use_vec_extr = TARGET_SSE4_1;
36720 if (use_vec_extr)
36721 break;
36723 switch (elt)
36725 case 0:
36726 tmp = vec;
36727 break;
36729 case 1:
36730 case 3:
36731 tmp = gen_reg_rtx (mode);
36732 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36733 GEN_INT (elt), GEN_INT (elt),
36734 GEN_INT (elt+4), GEN_INT (elt+4)));
36735 break;
36737 case 2:
36738 tmp = gen_reg_rtx (mode);
36739 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36740 break;
36742 default:
36743 gcc_unreachable ();
36745 vec = tmp;
36746 use_vec_extr = true;
36747 elt = 0;
36748 break;
36750 case V4SImode:
36751 use_vec_extr = TARGET_SSE4_1;
36752 if (use_vec_extr)
36753 break;
36755 if (TARGET_SSE2)
36757 switch (elt)
36759 case 0:
36760 tmp = vec;
36761 break;
36763 case 1:
36764 case 3:
36765 tmp = gen_reg_rtx (mode);
36766 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36767 GEN_INT (elt), GEN_INT (elt),
36768 GEN_INT (elt), GEN_INT (elt)));
36769 break;
36771 case 2:
36772 tmp = gen_reg_rtx (mode);
36773 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36774 break;
36776 default:
36777 gcc_unreachable ();
36779 vec = tmp;
36780 use_vec_extr = true;
36781 elt = 0;
36783 else
36785 /* For SSE1, we have to reuse the V4SF code. */
36786 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36787 gen_lowpart (V4SFmode, vec), elt);
36788 return;
36790 break;
36792 case V8HImode:
36793 use_vec_extr = TARGET_SSE2;
36794 break;
36795 case V4HImode:
36796 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36797 break;
36799 case V16QImode:
36800 use_vec_extr = TARGET_SSE4_1;
36801 break;
36803 case V8SFmode:
36804 if (TARGET_AVX)
36806 tmp = gen_reg_rtx (V4SFmode);
36807 if (elt < 4)
36808 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36809 else
36810 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36811 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36812 return;
36814 break;
36816 case V4DFmode:
36817 if (TARGET_AVX)
36819 tmp = gen_reg_rtx (V2DFmode);
36820 if (elt < 2)
36821 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36822 else
36823 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36824 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36825 return;
36827 break;
36829 case V32QImode:
36830 if (TARGET_AVX)
36832 tmp = gen_reg_rtx (V16QImode);
36833 if (elt < 16)
36834 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36835 else
36836 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36837 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36838 return;
36840 break;
36842 case V16HImode:
36843 if (TARGET_AVX)
36845 tmp = gen_reg_rtx (V8HImode);
36846 if (elt < 8)
36847 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36848 else
36849 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36850 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36851 return;
36853 break;
36855 case V8SImode:
36856 if (TARGET_AVX)
36858 tmp = gen_reg_rtx (V4SImode);
36859 if (elt < 4)
36860 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36861 else
36862 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36863 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36864 return;
36866 break;
36868 case V4DImode:
36869 if (TARGET_AVX)
36871 tmp = gen_reg_rtx (V2DImode);
36872 if (elt < 2)
36873 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36874 else
36875 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36876 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36877 return;
36879 break;
36881 case V8QImode:
36882 /* ??? Could extract the appropriate HImode element and shift. */
36883 default:
36884 break;
36887 if (use_vec_extr)
36889 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36890 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36892 /* Let the rtl optimizers know about the zero extension performed. */
36893 if (inner_mode == QImode || inner_mode == HImode)
36895 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36896 target = gen_lowpart (SImode, target);
36899 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36901 else
36903 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36905 emit_move_insn (mem, vec);
36907 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36908 emit_move_insn (target, tmp);
36912 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36913 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36914 The upper bits of DEST are undefined, though they shouldn't cause
36915 exceptions (some bits from src or all zeros are ok). */
36917 static void
36918 emit_reduc_half (rtx dest, rtx src, int i)
36920 rtx tem;
36921 switch (GET_MODE (src))
36923 case V4SFmode:
36924 if (i == 128)
36925 tem = gen_sse_movhlps (dest, src, src);
36926 else
36927 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36928 GEN_INT (1 + 4), GEN_INT (1 + 4));
36929 break;
36930 case V2DFmode:
36931 tem = gen_vec_interleave_highv2df (dest, src, src);
36932 break;
36933 case V16QImode:
36934 case V8HImode:
36935 case V4SImode:
36936 case V2DImode:
36937 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36938 gen_lowpart (V1TImode, src),
36939 GEN_INT (i / 2));
36940 break;
36941 case V8SFmode:
36942 if (i == 256)
36943 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36944 else
36945 tem = gen_avx_shufps256 (dest, src, src,
36946 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36947 break;
36948 case V4DFmode:
36949 if (i == 256)
36950 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36951 else
36952 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36953 break;
36954 case V32QImode:
36955 case V16HImode:
36956 case V8SImode:
36957 case V4DImode:
36958 if (i == 256)
36959 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36960 gen_lowpart (V4DImode, src),
36961 gen_lowpart (V4DImode, src),
36962 const1_rtx);
36963 else
36964 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36965 gen_lowpart (V2TImode, src),
36966 GEN_INT (i / 2));
36967 break;
36968 default:
36969 gcc_unreachable ();
36971 emit_insn (tem);
36974 /* Expand a vector reduction. FN is the binary pattern to reduce;
36975 DEST is the destination; IN is the input vector. */
36977 void
36978 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36980 rtx half, dst, vec = in;
36981 enum machine_mode mode = GET_MODE (in);
36982 int i;
36984 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
36985 if (TARGET_SSE4_1
36986 && mode == V8HImode
36987 && fn == gen_uminv8hi3)
36989 emit_insn (gen_sse4_1_phminposuw (dest, in));
36990 return;
36993 for (i = GET_MODE_BITSIZE (mode);
36994 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
36995 i >>= 1)
36997 half = gen_reg_rtx (mode);
36998 emit_reduc_half (half, vec, i);
36999 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37000 dst = dest;
37001 else
37002 dst = gen_reg_rtx (mode);
37003 emit_insn (fn (dst, half, vec));
37004 vec = dst;
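/* Illustrative sketch, not part of this file: the loop above is the usual
   log2(n) reduction; each pass folds the upper half of the still-active
   lanes onto the lower half until a single lane holds the result.  A plain
   C model for an 8-lane integer maximum; reduce_max8 is a hypothetical
   name.  */
static int
reduce_max8 (const int *v /* 8 lanes */)
{
  int tmp[8];
  int width, k;
  __builtin_memcpy (tmp, v, sizeof tmp);
  for (width = 8; width > 1; width >>= 1)   /* i = 256, 128, 64 bits   */
    for (k = 0; k < width / 2; ++k)         /* emit_reduc_half plus fn */
      tmp[k] = tmp[k] > tmp[k + width / 2] ? tmp[k] : tmp[k + width / 2];
  return tmp[0];
}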
37008 /* Target hook for scalar_mode_supported_p. */
37009 static bool
37010 ix86_scalar_mode_supported_p (enum machine_mode mode)
37012 if (DECIMAL_FLOAT_MODE_P (mode))
37013 return default_decimal_float_supported_p ();
37014 else if (mode == TFmode)
37015 return true;
37016 else
37017 return default_scalar_mode_supported_p (mode);
37020 /* Implements target hook vector_mode_supported_p. */
37021 static bool
37022 ix86_vector_mode_supported_p (enum machine_mode mode)
37024 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37025 return true;
37026 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37027 return true;
37028 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37029 return true;
37030 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37031 return true;
37032 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37033 return true;
37034 return false;
37037 /* Target hook for c_mode_for_suffix. */
37038 static enum machine_mode
37039 ix86_c_mode_for_suffix (char suffix)
37041 if (suffix == 'q')
37042 return TFmode;
37043 if (suffix == 'w')
37044 return XFmode;
37046 return VOIDmode;
37049 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37051 We do this in the new i386 backend to maintain source compatibility
37052 with the old cc0-based compiler. */
37054 static tree
37055 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37056 tree inputs ATTRIBUTE_UNUSED,
37057 tree clobbers)
37059 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37060 clobbers);
37061 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37062 clobbers);
37063 return clobbers;
37066 /* Implements target vector targetm.asm.encode_section_info. */
37068 static void ATTRIBUTE_UNUSED
37069 ix86_encode_section_info (tree decl, rtx rtl, int first)
37071 default_encode_section_info (decl, rtl, first);
37073 if (TREE_CODE (decl) == VAR_DECL
37074 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37075 && ix86_in_large_data_p (decl))
37076 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37079 /* Worker function for REVERSE_CONDITION. */
37081 enum rtx_code
37082 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37084 return (mode != CCFPmode && mode != CCFPUmode
37085 ? reverse_condition (code)
37086 : reverse_condition_maybe_unordered (code));
37089 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37090 to OPERANDS[0]. */
37092 const char *
37093 output_387_reg_move (rtx insn, rtx *operands)
37095 if (REG_P (operands[0]))
37097 if (REG_P (operands[1])
37098 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37100 if (REGNO (operands[0]) == FIRST_STACK_REG)
37101 return output_387_ffreep (operands, 0);
37102 return "fstp\t%y0";
37104 if (STACK_TOP_P (operands[0]))
37105 return "fld%Z1\t%y1";
37106 return "fst\t%y0";
37108 else if (MEM_P (operands[0]))
37110 gcc_assert (REG_P (operands[1]));
37111 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37112 return "fstp%Z0\t%y0";
37113 else
37115 /* There is no non-popping store to memory for XFmode.
37116 So if we need one, follow the store with a load. */
37117 if (GET_MODE (operands[0]) == XFmode)
37118 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37119 else
37120 return "fst%Z0\t%y0";
37123 else
37124 gcc_unreachable();
37127 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37128 FP status register is set. */
37130 void
37131 ix86_emit_fp_unordered_jump (rtx label)
37133 rtx reg = gen_reg_rtx (HImode);
37134 rtx temp;
37136 emit_insn (gen_x86_fnstsw_1 (reg));
37138 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37140 emit_insn (gen_x86_sahf_1 (reg));
37142 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37143 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37145 else
37147 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37149 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37150 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37153 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37154 gen_rtx_LABEL_REF (VOIDmode, label),
37155 pc_rtx);
37156 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37158 emit_jump_insn (temp);
37159 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37162 /* Output code to perform a log1p XFmode calculation. */
37164 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37166 rtx label1 = gen_label_rtx ();
37167 rtx label2 = gen_label_rtx ();
37169 rtx tmp = gen_reg_rtx (XFmode);
37170 rtx tmp2 = gen_reg_rtx (XFmode);
37171 rtx test;
37173 emit_insn (gen_absxf2 (tmp, op1));
37174 test = gen_rtx_GE (VOIDmode, tmp,
37175 CONST_DOUBLE_FROM_REAL_VALUE (
37176 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37177 XFmode));
37178 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37180 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37181 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37182 emit_jump (label2);
37184 emit_label (label1);
37185 emit_move_insn (tmp, CONST1_RTX (XFmode));
37186 emit_insn (gen_addxf3 (tmp, op1, tmp));
37187 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37188 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37190 emit_label (label2);
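/* Illustrative sketch, not part of this file: the branch above selects
   between the two x87 logarithm instructions.  FYL2XP1 is only defined for
   |x| < 1 - sqrt(2)/2 ~= 0.2928932188..., which is the constant tested;
   outside that range log(1+x) is computed as FYL2X of (1 + x).  A C model
   with hypothetical names (log2p1 and log2 stand in for the two
   instructions).  */
static double
model_log1p (double x, double (*log2p1) (double), double (*log2) (double))
{
  const double ln2 = 0.69314718055994530942;   /* fldln2 */
  if (__builtin_fabs (x) < 0.29289321881345247561)
    return ln2 * log2p1 (x);                   /* fyl2xp1: y * log2 (x + 1) */
  else
    return ln2 * log2 (1.0 + x);               /* fyl2x:   y * log2 (x)     */
}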
37193 /* Emit code for round calculation. */
37194 void ix86_emit_i387_round (rtx op0, rtx op1)
37196 enum machine_mode inmode = GET_MODE (op1);
37197 enum machine_mode outmode = GET_MODE (op0);
37198 rtx e1, e2, res, tmp, tmp1, half;
37199 rtx scratch = gen_reg_rtx (HImode);
37200 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37201 rtx jump_label = gen_label_rtx ();
37202 rtx insn;
37203 rtx (*gen_abs) (rtx, rtx);
37204 rtx (*gen_neg) (rtx, rtx);
37206 switch (inmode)
37208 case SFmode:
37209 gen_abs = gen_abssf2;
37210 break;
37211 case DFmode:
37212 gen_abs = gen_absdf2;
37213 break;
37214 case XFmode:
37215 gen_abs = gen_absxf2;
37216 break;
37217 default:
37218 gcc_unreachable ();
37221 switch (outmode)
37223 case SFmode:
37224 gen_neg = gen_negsf2;
37225 break;
37226 case DFmode:
37227 gen_neg = gen_negdf2;
37228 break;
37229 case XFmode:
37230 gen_neg = gen_negxf2;
37231 break;
37232 case HImode:
37233 gen_neg = gen_neghi2;
37234 break;
37235 case SImode:
37236 gen_neg = gen_negsi2;
37237 break;
37238 case DImode:
37239 gen_neg = gen_negdi2;
37240 break;
37241 default:
37242 gcc_unreachable ();
37245 e1 = gen_reg_rtx (inmode);
37246 e2 = gen_reg_rtx (inmode);
37247 res = gen_reg_rtx (outmode);
37249 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37251 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37253 /* scratch = fxam(op1) */
37254 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37255 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37256 UNSPEC_FXAM)));
37257 /* e1 = fabs(op1) */
37258 emit_insn (gen_abs (e1, op1));
37260 /* e2 = e1 + 0.5 */
37261 half = force_reg (inmode, half);
37262 emit_insn (gen_rtx_SET (VOIDmode, e2,
37263 gen_rtx_PLUS (inmode, e1, half)));
37265 /* res = floor(e2) */
37266 if (inmode != XFmode)
37268 tmp1 = gen_reg_rtx (XFmode);
37270 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37271 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37273 else
37274 tmp1 = e2;
37276 switch (outmode)
37278 case SFmode:
37279 case DFmode:
37281 rtx tmp0 = gen_reg_rtx (XFmode);
37283 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37285 emit_insn (gen_rtx_SET (VOIDmode, res,
37286 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37287 UNSPEC_TRUNC_NOOP)));
37289 break;
37290 case XFmode:
37291 emit_insn (gen_frndintxf2_floor (res, tmp1));
37292 break;
37293 case HImode:
37294 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37295 break;
37296 case SImode:
37297 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37298 break;
37299 case DImode:
37300 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37301 break;
37302 default:
37303 gcc_unreachable ();
37306 /* flags = signbit(a) */
37307 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37309 /* if (flags) then res = -res */
37310 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37311 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37312 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37313 pc_rtx);
37314 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37315 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37316 JUMP_LABEL (insn) = jump_label;
37318 emit_insn (gen_neg (res, res));
37320 emit_label (jump_label);
37321 LABEL_NUSES (jump_label) = 1;
37323 emit_move_insn (op0, res);
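/* Illustrative sketch, not part of this file: the sequence above is
   round(a) = sgn(a) * floor(fabs(a) + 0.5), with the sign taken from the
   fxam status bits so that negative inputs (including -0.0) are negated at
   the end.  A C model using GCC builtins; model_round is a hypothetical
   name.  */
static double
model_round (double a)
{
  double r = __builtin_floor (__builtin_fabs (a) + 0.5);  /* e2, frndint   */
  return __builtin_signbit (a) ? -r : r;                  /* fxam/test/neg */
}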
37326 /* Output code to perform a Newton-Raphson approximation of a single precision
37327 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37329 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37331 rtx x0, x1, e0, e1;
37333 x0 = gen_reg_rtx (mode);
37334 e0 = gen_reg_rtx (mode);
37335 e1 = gen_reg_rtx (mode);
37336 x1 = gen_reg_rtx (mode);
37338 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37340 b = force_reg (mode, b);
37342 /* x0 = rcp(b) estimate */
37343 emit_insn (gen_rtx_SET (VOIDmode, x0,
37344 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37345 UNSPEC_RCP)));
37346 /* e0 = x0 * b */
37347 emit_insn (gen_rtx_SET (VOIDmode, e0,
37348 gen_rtx_MULT (mode, x0, b)));
37350 /* e0 = x0 * e0 */
37351 emit_insn (gen_rtx_SET (VOIDmode, e0,
37352 gen_rtx_MULT (mode, x0, e0)));
37354 /* e1 = x0 + x0 */
37355 emit_insn (gen_rtx_SET (VOIDmode, e1,
37356 gen_rtx_PLUS (mode, x0, x0)));
37358 /* x1 = e1 - e0 */
37359 emit_insn (gen_rtx_SET (VOIDmode, x1,
37360 gen_rtx_MINUS (mode, e1, e0)));
37362 /* res = a * x1 */
37363 emit_insn (gen_rtx_SET (VOIDmode, res,
37364 gen_rtx_MULT (mode, a, x1)));
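/* Illustrative sketch, not part of this file: this is one Newton-Raphson
   step for the reciprocal.  If x0 ~ 1/b with relative error e, then
   x1 = 2*x0 - b*x0*x0 = x0*(2 - b*x0) has error on the order of e*e, so
   RCPPS's roughly 12-bit estimate is refined to roughly 22-23 bits.
   A C model; model_swdiv and rcp_estimate are hypothetical names.  */
static float
model_swdiv (float a, float b, float (*rcp_estimate) (float))
{
  float x0, e0, e1, x1;
  x0 = rcp_estimate (b);        /* x0 = rcp(b) estimate */
  e0 = x0 * b;                  /* e0 = x0 * b          */
  e0 = x0 * e0;                 /* e0 = b * x0 * x0     */
  e1 = x0 + x0;                 /* e1 = 2 * x0          */
  x1 = e1 - e0;                 /* x1 = 2*x0 - b*x0*x0  */
  return a * x1;                /* a / b ~= a * x1      */
}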
37367 /* Output code to perform a Newton-Raphson approximation of a
37368 single precision floating point [reciprocal] square root. */
37370 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37371 bool recip)
37373 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37374 REAL_VALUE_TYPE r;
37376 x0 = gen_reg_rtx (mode);
37377 e0 = gen_reg_rtx (mode);
37378 e1 = gen_reg_rtx (mode);
37379 e2 = gen_reg_rtx (mode);
37380 e3 = gen_reg_rtx (mode);
37382 real_from_integer (&r, VOIDmode, -3, -1, 0);
37383 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37385 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37386 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37388 if (VECTOR_MODE_P (mode))
37390 mthree = ix86_build_const_vector (mode, true, mthree);
37391 mhalf = ix86_build_const_vector (mode, true, mhalf);
37394 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37395 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37397 a = force_reg (mode, a);
37399 /* x0 = rsqrt(a) estimate */
37400 emit_insn (gen_rtx_SET (VOIDmode, x0,
37401 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37402 UNSPEC_RSQRT)));
37404 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
37405 if (!recip)
37407 rtx zero, mask;
37409 zero = gen_reg_rtx (mode);
37410 mask = gen_reg_rtx (mode);
37412 zero = force_reg (mode, CONST0_RTX(mode));
37413 emit_insn (gen_rtx_SET (VOIDmode, mask,
37414 gen_rtx_NE (mode, zero, a)));
37416 emit_insn (gen_rtx_SET (VOIDmode, x0,
37417 gen_rtx_AND (mode, x0, mask)));
37420 /* e0 = x0 * a */
37421 emit_insn (gen_rtx_SET (VOIDmode, e0,
37422 gen_rtx_MULT (mode, x0, a)));
37423 /* e1 = e0 * x0 */
37424 emit_insn (gen_rtx_SET (VOIDmode, e1,
37425 gen_rtx_MULT (mode, e0, x0)));
37427 /* e2 = e1 - 3. */
37428 mthree = force_reg (mode, mthree);
37429 emit_insn (gen_rtx_SET (VOIDmode, e2,
37430 gen_rtx_PLUS (mode, e1, mthree)));
37432 mhalf = force_reg (mode, mhalf);
37433 if (recip)
37434 /* e3 = -.5 * x0 */
37435 emit_insn (gen_rtx_SET (VOIDmode, e3,
37436 gen_rtx_MULT (mode, x0, mhalf)));
37437 else
37438 /* e3 = -.5 * e0 */
37439 emit_insn (gen_rtx_SET (VOIDmode, e3,
37440 gen_rtx_MULT (mode, e0, mhalf)));
37441 /* ret = e2 * e3 */
37442 emit_insn (gen_rtx_SET (VOIDmode, res,
37443 gen_rtx_MULT (mode, e2, e3)));
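/* Illustrative sketch, not part of this file: one Newton-Raphson step on
   RSQRTPS's estimate x0 ~ 1/sqrt(a):
       rsqrt(a) ~= 0.5 * x0 * (3 - a*x0*x0)
       sqrt(a)  ~= a * rsqrt(a) = 0.5 * (a*x0) * (3 - a*x0*x0)
   The (a != 0) mask in the !recip path zeroes the infinite estimate for
   a == 0 so that sqrt(0.0) comes out as 0.0 rather than 0 * inf = NaN.
   A C model; model_swsqrt and rsqrt_estimate are hypothetical names.  */
static float
model_swsqrt (float a, int recip, float (*rsqrt_estimate) (float))
{
  float x0, e0, e1, e2, e3;
  x0 = rsqrt_estimate (a);
  if (!recip && a == 0.0f)
    x0 = 0.0f;                        /* the AND-mask filter            */
  e0 = x0 * a;                        /* e0 = a * x0                    */
  e1 = e0 * x0;                       /* e1 = a * x0 * x0               */
  e2 = e1 - 3.0f;                     /* e2 = a*x0*x0 - 3               */
  e3 = (recip ? x0 : e0) * -0.5f;     /* e3 = -.5 * (x0 or a*x0)        */
  return e2 * e3;                     /* 0.5*(3 - a*x0*x0)*(x0 or a*x0) */
}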
37446 #ifdef TARGET_SOLARIS
37447 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37449 static void
37450 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37451 tree decl)
37453 /* With Binutils 2.15, the "@unwind" marker must be specified on
37454 every occurrence of the ".eh_frame" section, not just the first
37455 one. */
37456 if (TARGET_64BIT
37457 && strcmp (name, ".eh_frame") == 0)
37459 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37460 flags & SECTION_WRITE ? "aw" : "a");
37461 return;
37464 #ifndef USE_GAS
37465 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37467 solaris_elf_asm_comdat_section (name, flags, decl);
37468 return;
37470 #endif
37472 default_elf_asm_named_section (name, flags, decl);
37474 #endif /* TARGET_SOLARIS */
37476 /* Return the mangling of TYPE if it is an extended fundamental type. */
37478 static const char *
37479 ix86_mangle_type (const_tree type)
37481 type = TYPE_MAIN_VARIANT (type);
37483 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37484 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37485 return NULL;
37487 switch (TYPE_MODE (type))
37489 case TFmode:
37490 /* __float128 is "g". */
37491 return "g";
37492 case XFmode:
37493 /* "long double" or __float80 is "e". */
37494 return "e";
37495 default:
37496 return NULL;
37500 /* For 32-bit code we can save PIC register setup by using
37501 __stack_chk_fail_local hidden function instead of calling
37502 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
37503 register, so it is better to call __stack_chk_fail directly. */
37505 static tree ATTRIBUTE_UNUSED
37506 ix86_stack_protect_fail (void)
37508 return TARGET_64BIT
37509 ? default_external_stack_protect_fail ()
37510 : default_hidden_stack_protect_fail ();
37513 /* Select a format to encode pointers in exception handling data. CODE
37514 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37515 true if the symbol may be affected by dynamic relocations.
37517 ??? All x86 object file formats are capable of representing this.
37518 After all, the relocation needed is the same as for the call insn.
37519 Whether or not a particular assembler allows us to enter such, I
37520 guess we'll have to see. */
37521 int
37522 asm_preferred_eh_data_format (int code, int global)
37524 if (flag_pic)
37526 int type = DW_EH_PE_sdata8;
37527 if (!TARGET_64BIT
37528 || ix86_cmodel == CM_SMALL_PIC
37529 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37530 type = DW_EH_PE_sdata4;
37531 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37533 if (ix86_cmodel == CM_SMALL
37534 || (ix86_cmodel == CM_MEDIUM && code))
37535 return DW_EH_PE_udata4;
37536 return DW_EH_PE_absptr;
37539 /* Expand copysign from SIGN to the positive value ABS_VALUE
37540 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37541 the sign-bit. */
37542 static void
37543 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37545 enum machine_mode mode = GET_MODE (sign);
37546 rtx sgn = gen_reg_rtx (mode);
37547 if (mask == NULL_RTX)
37549 enum machine_mode vmode;
37551 if (mode == SFmode)
37552 vmode = V4SFmode;
37553 else if (mode == DFmode)
37554 vmode = V2DFmode;
37555 else
37556 vmode = mode;
37558 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37559 if (!VECTOR_MODE_P (mode))
37561 /* We need to generate a scalar mode mask in this case. */
37562 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37563 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37564 mask = gen_reg_rtx (mode);
37565 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37568 else
37569 mask = gen_rtx_NOT (mode, mask);
37570 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37571 gen_rtx_AND (mode, mask, sign)));
37572 emit_insn (gen_rtx_SET (VOIDmode, result,
37573 gen_rtx_IOR (mode, abs_value, sgn)));
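/* Illustrative sketch, not part of this file: since ABS_VALUE is expected
   to have a clear sign bit, the sequence reduces to
       result = abs_value | (sign & SIGNBIT_MASK)
   i.e. copy SIGN's sign bit onto ABS_VALUE.  A scalar model assuming
   32-bit floats; model_copysign_pos is a hypothetical name.  */
static float
model_copysign_pos (float abs_value, float sign)
{
  unsigned int a, s;
  __builtin_memcpy (&a, &abs_value, sizeof a);
  __builtin_memcpy (&s, &sign, sizeof s);
  a |= s & 0x80000000u;                 /* OR in the borrowed sign bit */
  __builtin_memcpy (&abs_value, &a, sizeof a);
  return abs_value;
}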
37576 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37577 mask for masking out the sign-bit is stored in *SMASK, if that is
37578 non-null. */
37579 static rtx
37580 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37582 enum machine_mode vmode, mode = GET_MODE (op0);
37583 rtx xa, mask;
37585 xa = gen_reg_rtx (mode);
37586 if (mode == SFmode)
37587 vmode = V4SFmode;
37588 else if (mode == DFmode)
37589 vmode = V2DFmode;
37590 else
37591 vmode = mode;
37592 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37593 if (!VECTOR_MODE_P (mode))
37595 /* We need to generate a scalar mode mask in this case. */
37596 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37597 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37598 mask = gen_reg_rtx (mode);
37599 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37601 emit_insn (gen_rtx_SET (VOIDmode, xa,
37602 gen_rtx_AND (mode, op0, mask)));
37604 if (smask)
37605 *smask = mask;
37607 return xa;
37610 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37611 swapping the operands if SWAP_OPERANDS is true. The expanded
37612 code is a forward jump to a newly created label in case the
37613 comparison is true. The generated label rtx is returned. */
37614 static rtx
37615 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37616 bool swap_operands)
37618 rtx label, tmp;
37620 if (swap_operands)
37622 tmp = op0;
37623 op0 = op1;
37624 op1 = tmp;
37627 label = gen_label_rtx ();
37628 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37629 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37630 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37631 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37632 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37633 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37634 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37635 JUMP_LABEL (tmp) = label;
37637 return label;
37640 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37641 using comparison code CODE. Operands are swapped for the comparison if
37642 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37643 static rtx
37644 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37645 bool swap_operands)
37647 rtx (*insn)(rtx, rtx, rtx, rtx);
37648 enum machine_mode mode = GET_MODE (op0);
37649 rtx mask = gen_reg_rtx (mode);
37651 if (swap_operands)
37653 rtx tmp = op0;
37654 op0 = op1;
37655 op1 = tmp;
37658 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37660 emit_insn (insn (mask, op0, op1,
37661 gen_rtx_fmt_ee (code, mode, op0, op1)));
37662 return mask;
37665 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37666 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37667 static rtx
37668 ix86_gen_TWO52 (enum machine_mode mode)
37670 REAL_VALUE_TYPE TWO52r;
37671 rtx TWO52;
37673 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37674 TWO52 = const_double_from_real_value (TWO52r, mode);
37675 TWO52 = force_reg (mode, TWO52);
37677 return TWO52;
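/* Illustrative note, not part of this file: for 0 <= x < 2**52, x + 2**52
   lands in [2**52, 2**53), where consecutive doubles are exactly 1.0
   apart, so the addition itself rounds x to an integer (in the current
   rounding mode) and subtracting 2**52 back is exact.  For example, with
   round-to-nearest, 3.7 + 0x1p52 gives 4503599627370500.0, and subtracting
   0x1p52 yields 4.0.  SFmode uses 2**23 the same way.  A minimal C model;
   model_rint_via_two52 is a hypothetical name.  */
static double
model_rint_via_two52 (double x /* assumed 0 <= x < 2**52 */)
{
  const double two52 = 0x1p52;
  return (x + two52) - two52;   /* rounds x to integer, then exact subtract */
}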
37680 /* Expand SSE sequence for computing lround from OP1 storing
37681 into OP0. */
37682 void
37683 ix86_expand_lround (rtx op0, rtx op1)
37685 /* C code for the stuff we're doing below:
37686 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37687 return (long)tmp;
37689 enum machine_mode mode = GET_MODE (op1);
37690 const struct real_format *fmt;
37691 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37692 rtx adj;
37694 /* load nextafter (0.5, 0.0) */
37695 fmt = REAL_MODE_FORMAT (mode);
37696 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37697 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37699 /* adj = copysign (0.5, op1) */
37700 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37701 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37703 /* adj = op1 + adj */
37704 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37706 /* op0 = (imode)adj */
37707 expand_fix (op0, adj, 0);
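/* Illustrative note, not part of this file: the addend is
   nextafter (0.5, 0.0) rather than 0.5 so the addition cannot push a value
   just below a halfway point over it.  For x = 0.49999999999999994 (the
   largest double below 0.5), x + 0.5 rounds to 1.0 and would give
   lround(x) == 1, while x + nextafter(0.5, 0.0) rounds to the double just
   below 1.0 and truncates to the correct 0; exact halves such as 0.5 still
   round up to 1.  A C model using GCC builtins; model_lround is a
   hypothetical name.  */
static long
model_lround (double x)
{
  double adj = __builtin_copysign (__builtin_nextafter (0.5, 0.0), x);
  return (long) (x + adj);      /* truncating conversion, as expand_fix */
}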
37710 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
37711 into OPERAND0. */
37712 void
37713 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37715 /* C code for the stuff we're doing below (for do_floor):
37716 xi = (long)op1;
37717 xi -= (double)xi > op1 ? 1 : 0;
37718 return xi;
37720 enum machine_mode fmode = GET_MODE (op1);
37721 enum machine_mode imode = GET_MODE (op0);
37722 rtx ireg, freg, label, tmp;
37724 /* reg = (long)op1 */
37725 ireg = gen_reg_rtx (imode);
37726 expand_fix (ireg, op1, 0);
37728 /* freg = (double)reg */
37729 freg = gen_reg_rtx (fmode);
37730 expand_float (freg, ireg, 0);
37732 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37733 label = ix86_expand_sse_compare_and_jump (UNLE,
37734 freg, op1, !do_floor);
37735 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37736 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37737 emit_move_insn (ireg, tmp);
37739 emit_label (label);
37740 LABEL_NUSES (label) = 1;
37742 emit_move_insn (op0, ireg);
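/* Illustrative sketch, not part of this file: truncate toward zero,
   convert back, and compensate by one when the truncation went the wrong
   way for floor or ceil.  A C model; model_lfloorceil is a hypothetical
   name.  */
static long
model_lfloorceil (double x, int do_floor)
{
  long xi = (long) x;                  /* expand_fix: truncation          */
  double fx = (double) xi;             /* expand_float                    */
  if (do_floor)
    return fx > x ? xi - 1 : xi;       /* truncation rounded up (x < 0)   */
  else
    return fx < x ? xi + 1 : xi;       /* truncation rounded down (x > 0) */
}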
37745 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37746 result in OPERAND0. */
37747 void
37748 ix86_expand_rint (rtx operand0, rtx operand1)
37750 /* C code for the stuff we're doing below:
37751 xa = fabs (operand1);
37752 if (!isless (xa, 2**52))
37753 return operand1;
37754 xa = xa + 2**52 - 2**52;
37755 return copysign (xa, operand1);
37757 enum machine_mode mode = GET_MODE (operand0);
37758 rtx res, xa, label, TWO52, mask;
37760 res = gen_reg_rtx (mode);
37761 emit_move_insn (res, operand1);
37763 /* xa = abs (operand1) */
37764 xa = ix86_expand_sse_fabs (res, &mask);
37766 /* if (!isless (xa, TWO52)) goto label; */
37767 TWO52 = ix86_gen_TWO52 (mode);
37768 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37770 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37771 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37773 ix86_sse_copysign_to_positive (res, xa, res, mask);
37775 emit_label (label);
37776 LABEL_NUSES (label) = 1;
37778 emit_move_insn (operand0, res);
37781 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37782 into OPERAND0. */
37783 void
37784 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37786 /* C code for the stuff we expand below.
37787 double xa = fabs (x), x2;
37788 if (!isless (xa, TWO52))
37789 return x;
37790 xa = xa + TWO52 - TWO52;
37791 x2 = copysign (xa, x);
37792 Compensate. Floor:
37793 if (x2 > x)
37794 x2 -= 1;
37795 Compensate. Ceil:
37796 if (x2 < x)
37797 x2 -= -1;
37798 return x2;
37800 enum machine_mode mode = GET_MODE (operand0);
37801 rtx xa, TWO52, tmp, label, one, res, mask;
37803 TWO52 = ix86_gen_TWO52 (mode);
37805 /* Temporary for holding the result, initialized to the input
37806 operand to ease control flow. */
37807 res = gen_reg_rtx (mode);
37808 emit_move_insn (res, operand1);
37810 /* xa = abs (operand1) */
37811 xa = ix86_expand_sse_fabs (res, &mask);
37813 /* if (!isless (xa, TWO52)) goto label; */
37814 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37816 /* xa = xa + TWO52 - TWO52; */
37817 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37818 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37820 /* xa = copysign (xa, operand1) */
37821 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37823 /* generate 1.0 or -1.0 */
37824 one = force_reg (mode,
37825 const_double_from_real_value (do_floor
37826 ? dconst1 : dconstm1, mode));
37828 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37829 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37830 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37831 gen_rtx_AND (mode, one, tmp)));
37832 /* We always need to subtract here to preserve signed zero. */
37833 tmp = expand_simple_binop (mode, MINUS,
37834 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37835 emit_move_insn (res, tmp);
37837 emit_label (label);
37838 LABEL_NUSES (label) = 1;
37840 emit_move_insn (operand0, res);
37843 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37844 into OPERAND0. */
37845 void
37846 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37848 /* C code for the stuff we expand below.
37849 double xa = fabs (x), x2;
37850 if (!isless (xa, TWO52))
37851 return x;
37852 x2 = (double)(long)x;
37853 Compensate. Floor:
37854 if (x2 > x)
37855 x2 -= 1;
37856 Compensate. Ceil:
37857 if (x2 < x)
37858 x2 += 1;
37859 if (HONOR_SIGNED_ZEROS (mode))
37860 return copysign (x2, x);
37861 return x2;
37863 enum machine_mode mode = GET_MODE (operand0);
37864 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37866 TWO52 = ix86_gen_TWO52 (mode);
37868 /* Temporary for holding the result, initialized to the input
37869 operand to ease control flow. */
37870 res = gen_reg_rtx (mode);
37871 emit_move_insn (res, operand1);
37873 /* xa = abs (operand1) */
37874 xa = ix86_expand_sse_fabs (res, &mask);
37876 /* if (!isless (xa, TWO52)) goto label; */
37877 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37879 /* xa = (double)(long)x */
37880 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37881 expand_fix (xi, res, 0);
37882 expand_float (xa, xi, 0);
37884 /* generate 1.0 */
37885 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37887 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37888 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37889 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37890 gen_rtx_AND (mode, one, tmp)));
37891 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37892 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37893 emit_move_insn (res, tmp);
37895 if (HONOR_SIGNED_ZEROS (mode))
37896 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37898 emit_label (label);
37899 LABEL_NUSES (label) = 1;
37901 emit_move_insn (operand0, res);
37904 /* Expand SSE sequence for computing round from OPERAND1 storing
37905 into OPERAND0. Sequence that works without relying on DImode truncation
37906 via cvttsd2siq that is only available on 64bit targets. */
37907 void
37908 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37910 /* C code for the stuff we expand below.
37911 double xa = fabs (x), xa2, x2;
37912 if (!isless (xa, TWO52))
37913 return x;
37914 Using the absolute value and copying back sign makes
37915 -0.0 -> -0.0 correct.
37916 xa2 = xa + TWO52 - TWO52;
37917 Compensate.
37918 dxa = xa2 - xa;
37919 if (dxa <= -0.5)
37920 xa2 += 1;
37921 else if (dxa > 0.5)
37922 xa2 -= 1;
37923 x2 = copysign (xa2, x);
37924 return x2;
37926 enum machine_mode mode = GET_MODE (operand0);
37927 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37929 TWO52 = ix86_gen_TWO52 (mode);
37931 /* Temporary for holding the result, initialized to the input
37932 operand to ease control flow. */
37933 res = gen_reg_rtx (mode);
37934 emit_move_insn (res, operand1);
37936 /* xa = abs (operand1) */
37937 xa = ix86_expand_sse_fabs (res, &mask);
37939 /* if (!isless (xa, TWO52)) goto label; */
37940 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37942 /* xa2 = xa + TWO52 - TWO52; */
37943 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37944 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37946 /* dxa = xa2 - xa; */
37947 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37949 /* generate 0.5, 1.0 and -0.5 */
37950 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37951 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37952 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37953 0, OPTAB_DIRECT);
37955 /* Compensate. */
37956 tmp = gen_reg_rtx (mode);
37957 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37958 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37959 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37960 gen_rtx_AND (mode, one, tmp)));
37961 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37962 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37963 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37964 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37965 gen_rtx_AND (mode, one, tmp)));
37966 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37968 /* res = copysign (xa2, operand1) */
37969 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37971 emit_label (label);
37972 LABEL_NUSES (label) = 1;
37974 emit_move_insn (operand0, res);
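/* Illustrative sketch, not part of this file: xa2 is fabs(x) rounded to
   nearest with ties-to-even (the TWO52 trick); the dxa compensation then
   turns ties into round-half-away-from-zero, which round() requires.
   E.g. xa = 2.5 gives xa2 = 2.0 and dxa = -0.5, so one is added back and
   the magnitude becomes 3.0.  A C model of the magnitude computation;
   model_round_magnitude is a hypothetical name.  */
static double
model_round_magnitude (double xa /* = fabs (x), assumed < 2**52 */)
{
  double xa2 = (xa + 0x1p52) - 0x1p52;   /* nearest integer, ties to even */
  double dxa = xa2 - xa;
  if (dxa > 0.5)
    xa2 -= 1.0;
  else if (dxa <= -0.5)
    xa2 += 1.0;
  return xa2;                            /* caller copies the sign back */
}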
37977 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37978 into OPERAND0. */
37979 void
37980 ix86_expand_trunc (rtx operand0, rtx operand1)
37982 /* C code for SSE variant we expand below.
37983 double xa = fabs (x), x2;
37984 if (!isless (xa, TWO52))
37985 return x;
37986 x2 = (double)(long)x;
37987 if (HONOR_SIGNED_ZEROS (mode))
37988 return copysign (x2, x);
37989 return x2;
37991 enum machine_mode mode = GET_MODE (operand0);
37992 rtx xa, xi, TWO52, label, res, mask;
37994 TWO52 = ix86_gen_TWO52 (mode);
37996 /* Temporary for holding the result, initialized to the input
37997 operand to ease control flow. */
37998 res = gen_reg_rtx (mode);
37999 emit_move_insn (res, operand1);
38001 /* xa = abs (operand1) */
38002 xa = ix86_expand_sse_fabs (res, &mask);
38004 /* if (!isless (xa, TWO52)) goto label; */
38005 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38007 /* x = (double)(long)x */
38008 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38009 expand_fix (xi, res, 0);
38010 expand_float (res, xi, 0);
38012 if (HONOR_SIGNED_ZEROS (mode))
38013 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38015 emit_label (label);
38016 LABEL_NUSES (label) = 1;
38018 emit_move_insn (operand0, res);
38021 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38022 into OPERAND0. */
38023 void
38024 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38026 enum machine_mode mode = GET_MODE (operand0);
38027 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38029 /* C code for SSE variant we expand below.
38030 double xa = fabs (x), xa2, x2;
38031 if (!isless (xa, TWO52))
38032 return x;
38033 xa2 = xa + TWO52 - TWO52;
38034 Compensate:
38035 if (xa2 > xa)
38036 xa2 -= 1.0;
38037 x2 = copysign (xa2, x);
38038 return x2;
38041 TWO52 = ix86_gen_TWO52 (mode);
38043 /* Temporary for holding the result, initialized to the input
38044 operand to ease control flow. */
38045 res = gen_reg_rtx (mode);
38046 emit_move_insn (res, operand1);
38048 /* xa = abs (operand1) */
38049 xa = ix86_expand_sse_fabs (res, &smask);
38051 /* if (!isless (xa, TWO52)) goto label; */
38052 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38054 /* res = xa + TWO52 - TWO52; */
38055 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38056 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38057 emit_move_insn (res, tmp);
38059 /* generate 1.0 */
38060 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38062 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38063 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38064 emit_insn (gen_rtx_SET (VOIDmode, mask,
38065 gen_rtx_AND (mode, mask, one)));
38066 tmp = expand_simple_binop (mode, MINUS,
38067 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38068 emit_move_insn (res, tmp);
38070 /* res = copysign (res, operand1) */
38071 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38073 emit_label (label);
38074 LABEL_NUSES (label) = 1;
38076 emit_move_insn (operand0, res);
38079 /* Expand SSE sequence for computing round from OPERAND1 storing
38080 into OPERAND0. */
38081 void
38082 ix86_expand_round (rtx operand0, rtx operand1)
38084 /* C code for the stuff we're doing below:
38085 double xa = fabs (x);
38086 if (!isless (xa, TWO52))
38087 return x;
38088 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38089 return copysign (xa, x);
38091 enum machine_mode mode = GET_MODE (operand0);
38092 rtx res, TWO52, xa, label, xi, half, mask;
38093 const struct real_format *fmt;
38094 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38096 /* Temporary for holding the result, initialized to the input
38097 operand to ease control flow. */
38098 res = gen_reg_rtx (mode);
38099 emit_move_insn (res, operand1);
38101 TWO52 = ix86_gen_TWO52 (mode);
38102 xa = ix86_expand_sse_fabs (res, &mask);
38103 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38105 /* load nextafter (0.5, 0.0) */
38106 fmt = REAL_MODE_FORMAT (mode);
38107 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38108 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38110 /* xa = xa + 0.5 */
38111 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38112 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38114 /* xa = (double)(int64_t)xa */
38115 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38116 expand_fix (xi, xa, 0);
38117 expand_float (xa, xi, 0);
38119 /* res = copysign (xa, operand1) */
38120 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38122 emit_label (label);
38123 LABEL_NUSES (label) = 1;
38125 emit_move_insn (operand0, res);
38128 /* Expand SSE sequence for computing round
38129 from OP1 storing into OP0 using sse4 round insn. */
38130 void
38131 ix86_expand_round_sse4 (rtx op0, rtx op1)
38133 enum machine_mode mode = GET_MODE (op0);
38134 rtx e1, e2, res, half;
38135 const struct real_format *fmt;
38136 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38137 rtx (*gen_copysign) (rtx, rtx, rtx);
38138 rtx (*gen_round) (rtx, rtx, rtx);
38140 switch (mode)
38142 case SFmode:
38143 gen_copysign = gen_copysignsf3;
38144 gen_round = gen_sse4_1_roundsf2;
38145 break;
38146 case DFmode:
38147 gen_copysign = gen_copysigndf3;
38148 gen_round = gen_sse4_1_rounddf2;
38149 break;
38150 default:
38151 gcc_unreachable ();
38154 /* round (a) = trunc (a + copysign (0.5, a)) */
38156 /* load nextafter (0.5, 0.0) */
38157 fmt = REAL_MODE_FORMAT (mode);
38158 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38159 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38160 half = const_double_from_real_value (pred_half, mode);
38162 /* e1 = copysign (0.5, op1) */
38163 e1 = gen_reg_rtx (mode);
38164 emit_insn (gen_copysign (e1, half, op1));
38166 /* e2 = op1 + e1 */
38167 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38169 /* res = trunc (e2) */
38170 res = gen_reg_rtx (mode);
38171 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38173 emit_move_insn (op0, res);
38177 /* Table of valid machine attributes. */
38178 static const struct attribute_spec ix86_attribute_table[] =
38180 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38181 affects_type_identity } */
38182 /* Stdcall attribute says callee is responsible for popping arguments
38183 if they are not variable. */
38184 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38185 true },
38186 /* Fastcall attribute says callee is responsible for popping arguments
38187 if they are not variable. */
38188 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38189 true },
38190 /* Thiscall attribute says callee is responsible for popping arguments
38191 if they are not variable. */
38192 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38193 true },
38194 /* Cdecl attribute says the callee is a normal C declaration */
38195 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38196 true },
38197 /* Regparm attribute specifies how many integer arguments are to be
38198 passed in registers. */
38199 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38200 true },
38201 /* Sseregparm attribute says we are using x86_64 calling conventions
38202 for FP arguments. */
38203 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38204 true },
38205 /* The transactional memory builtins are implicitly regparm or fastcall
38206 depending on the ABI. Override the generic do-nothing attribute that
38207 these builtins were declared with. */
38208 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38209 true },
38210 /* force_align_arg_pointer says this function realigns the stack at entry. */
38211 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38212 false, true, true, ix86_handle_cconv_attribute, false },
38213 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38214 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38215 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38216 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38217 false },
38218 #endif
38219 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38220 false },
38221 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38222 false },
38223 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38224 SUBTARGET_ATTRIBUTE_TABLE,
38225 #endif
38226 /* ms_abi and sysv_abi calling convention function attributes. */
38227 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38228 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38229 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38230 false },
38231 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38232 ix86_handle_callee_pop_aggregate_return, true },
38233 /* End element. */
38234 { NULL, 0, 0, false, false, false, NULL, false }
38237 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38238 static int
38239 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38240 tree vectype,
38241 int misalign ATTRIBUTE_UNUSED)
38243 unsigned elements;
38245 switch (type_of_cost)
38247 case scalar_stmt:
38248 return ix86_cost->scalar_stmt_cost;
38250 case scalar_load:
38251 return ix86_cost->scalar_load_cost;
38253 case scalar_store:
38254 return ix86_cost->scalar_store_cost;
38256 case vector_stmt:
38257 return ix86_cost->vec_stmt_cost;
38259 case vector_load:
38260 return ix86_cost->vec_align_load_cost;
38262 case vector_store:
38263 return ix86_cost->vec_store_cost;
38265 case vec_to_scalar:
38266 return ix86_cost->vec_to_scalar_cost;
38268 case scalar_to_vec:
38269 return ix86_cost->scalar_to_vec_cost;
38271 case unaligned_load:
38272 case unaligned_store:
38273 return ix86_cost->vec_unalign_load_cost;
38275 case cond_branch_taken:
38276 return ix86_cost->cond_taken_branch_cost;
38278 case cond_branch_not_taken:
38279 return ix86_cost->cond_not_taken_branch_cost;
38281 case vec_perm:
38282 case vec_promote_demote:
38283 return ix86_cost->vec_stmt_cost;
38285 case vec_construct:
38286 elements = TYPE_VECTOR_SUBPARTS (vectype);
38287 return elements / 2 + 1;
38289 default:
38290 gcc_unreachable ();
38294 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38295 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38296 insn every time. */
38298 static GTY(()) rtx vselect_insn;
38300 /* Initialize vselect_insn. */
38302 static void
38303 init_vselect_insn (void)
38305 unsigned i;
38306 rtx x;
38308 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38309 for (i = 0; i < MAX_VECT_LEN; ++i)
38310 XVECEXP (x, 0, i) = const0_rtx;
38311 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38312 const0_rtx), x);
38313 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38314 start_sequence ();
38315 vselect_insn = emit_insn (x);
38316 end_sequence ();
38319 /* Construct (set target (vec_select op0 (parallel perm))) and
38320 return true if that's a valid instruction in the active ISA. */
38322 static bool
38323 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38324 unsigned nelt, bool testing_p)
38326 unsigned int i;
38327 rtx x, save_vconcat;
38328 int icode;
38330 if (vselect_insn == NULL_RTX)
38331 init_vselect_insn ();
38333 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38334 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38335 for (i = 0; i < nelt; ++i)
38336 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38337 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38338 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38339 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38340 SET_DEST (PATTERN (vselect_insn)) = target;
38341 icode = recog_memoized (vselect_insn);
38343 if (icode >= 0 && !testing_p)
38344 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38346 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38347 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38348 INSN_CODE (vselect_insn) = -1;
38350 return icode >= 0;
38353 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38355 static bool
38356 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38357 const unsigned char *perm, unsigned nelt,
38358 bool testing_p)
38360 enum machine_mode v2mode;
38361 rtx x;
38362 bool ok;
38364 if (vselect_insn == NULL_RTX)
38365 init_vselect_insn ();
38367 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38368 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38369 PUT_MODE (x, v2mode);
38370 XEXP (x, 0) = op0;
38371 XEXP (x, 1) = op1;
38372 ok = expand_vselect (target, x, perm, nelt, testing_p);
38373 XEXP (x, 0) = const0_rtx;
38374 XEXP (x, 1) = const0_rtx;
38375 return ok;
38378 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38379 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38381 static bool
38382 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38384 enum machine_mode vmode = d->vmode;
38385 unsigned i, mask, nelt = d->nelt;
38386 rtx target, op0, op1, x;
38387 rtx rperm[32], vperm;
38389 if (d->one_operand_p)
38390 return false;
38391 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38393 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38395 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38397 else
38398 return false;
38400 /* This is a blend, not a permute. Elements must stay in their
38401 respective lanes. */
38402 for (i = 0; i < nelt; ++i)
38404 unsigned e = d->perm[i];
38405 if (!(e == i || e == i + nelt))
38406 return false;
38409 if (d->testing_p)
38410 return true;
38412 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38413 decision should be extracted elsewhere, so that we only try that
38414 sequence once all budget==3 options have been tried. */
38415 target = d->target;
38416 op0 = d->op0;
38417 op1 = d->op1;
38418 mask = 0;
38420 switch (vmode)
38422 case V4DFmode:
38423 case V8SFmode:
38424 case V2DFmode:
38425 case V4SFmode:
38426 case V8HImode:
38427 case V8SImode:
38428 for (i = 0; i < nelt; ++i)
38429 mask |= (d->perm[i] >= nelt) << i;
38430 break;
38432 case V2DImode:
38433 for (i = 0; i < 2; ++i)
38434 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38435 vmode = V8HImode;
38436 goto do_subreg;
38438 case V4SImode:
38439 for (i = 0; i < 4; ++i)
38440 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38441 vmode = V8HImode;
38442 goto do_subreg;
38444 case V16QImode:
38445 /* See if bytes move in pairs so we can use pblendw with
38446 an immediate argument, rather than pblendvb with a vector
38447 argument. */
38448 for (i = 0; i < 16; i += 2)
38449 if (d->perm[i] + 1 != d->perm[i + 1])
38451 use_pblendvb:
38452 for (i = 0; i < nelt; ++i)
38453 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38455 finish_pblendvb:
38456 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38457 vperm = force_reg (vmode, vperm);
38459 if (GET_MODE_SIZE (vmode) == 16)
38460 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38461 else
38462 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38463 return true;
38466 for (i = 0; i < 8; ++i)
38467 mask |= (d->perm[i * 2] >= 16) << i;
38468 vmode = V8HImode;
38469 /* FALLTHRU */
38471 do_subreg:
38472 target = gen_lowpart (vmode, target);
38473 op0 = gen_lowpart (vmode, op0);
38474 op1 = gen_lowpart (vmode, op1);
38475 break;
38477 case V32QImode:
38478 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38479 for (i = 0; i < 32; i += 2)
38480 if (d->perm[i] + 1 != d->perm[i + 1])
38481 goto use_pblendvb;
38482 /* See if bytes move in quadruplets. If yes, vpblendd
38483 with immediate can be used. */
38484 for (i = 0; i < 32; i += 4)
38485 if (d->perm[i] + 2 != d->perm[i + 2])
38486 break;
38487 if (i < 32)
38489 /* See if bytes move the same in both lanes. If yes,
38490 vpblendw with immediate can be used. */
38491 for (i = 0; i < 16; i += 2)
38492 if (d->perm[i] + 16 != d->perm[i + 16])
38493 goto use_pblendvb;
38495 /* Use vpblendw. */
38496 for (i = 0; i < 16; ++i)
38497 mask |= (d->perm[i * 2] >= 32) << i;
38498 vmode = V16HImode;
38499 goto do_subreg;
38502 /* Use vpblendd. */
38503 for (i = 0; i < 8; ++i)
38504 mask |= (d->perm[i * 4] >= 32) << i;
38505 vmode = V8SImode;
38506 goto do_subreg;
38508 case V16HImode:
38509 /* See if words move in pairs. If yes, vpblendd can be used. */
38510 for (i = 0; i < 16; i += 2)
38511 if (d->perm[i] + 1 != d->perm[i + 1])
38512 break;
38513 if (i < 16)
38515 /* See if words move the same in both lanes. If not,
38516 vpblendvb must be used. */
38517 for (i = 0; i < 8; i++)
38518 if (d->perm[i] + 8 != d->perm[i + 8])
38520 /* Use vpblendvb. */
38521 for (i = 0; i < 32; ++i)
38522 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38524 vmode = V32QImode;
38525 nelt = 32;
38526 target = gen_lowpart (vmode, target);
38527 op0 = gen_lowpart (vmode, op0);
38528 op1 = gen_lowpart (vmode, op1);
38529 goto finish_pblendvb;
38532 /* Use vpblendw. */
38533 for (i = 0; i < 16; ++i)
38534 mask |= (d->perm[i] >= 16) << i;
38535 break;
38538 /* Use vpblendd. */
38539 for (i = 0; i < 8; ++i)
38540 mask |= (d->perm[i * 2] >= 16) << i;
38541 vmode = V8SImode;
38542 goto do_subreg;
38544 case V4DImode:
38545 /* Use vpblendd. */
38546 for (i = 0; i < 4; ++i)
38547 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38548 vmode = V8SImode;
38549 goto do_subreg;
38551 default:
38552 gcc_unreachable ();
38555 /* This matches five different patterns with the different modes. */
38556 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38557 x = gen_rtx_SET (VOIDmode, target, x);
38558 emit_insn (x);
38560 return true;
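/* Illustrative sketch, not part of this file: a blend never moves an
   element out of its position; it only chooses which operand position i
   comes from, so the whole permutation collapses to one bit per element:
   mask bit i = (perm[i] >= nelt) selects op1.  A scalar model of the final
   vec_merge; model_blend is a hypothetical name.  */
static void
model_blend (int *dest, const int *op0, const int *op1,
             unsigned mask, unsigned nelt)
{
  unsigned i;
  for (i = 0; i < nelt; ++i)
    dest[i] = ((mask >> i) & 1) ? op1[i] : op0[i];
}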
38563 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38564 in terms of the variable form of vpermilps.
38566 Note that we will have already failed the immediate input vpermilps,
38567 which requires that the high and low part shuffle be identical; the
38568 variable form doesn't require that. */
38570 static bool
38571 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38573 rtx rperm[8], vperm;
38574 unsigned i;
38576 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38577 return false;
38579 /* We can only permute within the 128-bit lane. */
38580 for (i = 0; i < 8; ++i)
38582 unsigned e = d->perm[i];
38583 if (i < 4 ? e >= 4 : e < 4)
38584 return false;
38587 if (d->testing_p)
38588 return true;
38590 for (i = 0; i < 8; ++i)
38592 unsigned e = d->perm[i];
38594 /* Within each 128-bit lane, the elements of op0 are numbered
38595 from 0 and the elements of op1 are numbered from 4. */
38596 if (e >= 8 + 4)
38597 e -= 8;
38598 else if (e >= 4)
38599 e -= 4;
38601 rperm[i] = GEN_INT (e);
38604 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38605 vperm = force_reg (V8SImode, vperm);
38606 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38608 return true;
38611 /* Return true if permutation D can be performed as VMODE permutation
38612 instead. */
38614 static bool
38615 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38617 unsigned int i, j, chunk;
38619 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38620 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38621 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38622 return false;
38624 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38625 return true;
38627 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38628 for (i = 0; i < d->nelt; i += chunk)
38629 if (d->perm[i] & (chunk - 1))
38630 return false;
38631 else
38632 for (j = 1; j < chunk; ++j)
38633 if (d->perm[i] + j != d->perm[i + j])
38634 return false;
38636 return true;
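/* Illustrative note, not part of this file: a permutation can be redone at
   a wider element size when elements move in aligned, consecutive groups.
   E.g. the V16QImode permutation
       { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }
   has chunk = 16/4 = 4 relative to V4SImode, passes both tests above, and
   is simply the V4SImode permutation { 1, 0, 3, 2 }.  A C model of the
   test; model_valid_wider_perm is a hypothetical name.  */
static int
model_valid_wider_perm (const unsigned char *perm, unsigned nelt,
                        unsigned wider_nelt)
{
  unsigned chunk = nelt / wider_nelt, i, j;
  for (i = 0; i < nelt; i += chunk)
    {
      if (perm[i] & (chunk - 1))        /* group must start chunk-aligned */
        return 0;
      for (j = 1; j < chunk; ++j)
        if (perm[i] + j != perm[i + j]) /* and stay consecutive           */
          return 0;
    }
  return 1;
}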
38639 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38640 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38642 static bool
38643 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38645 unsigned i, nelt, eltsz, mask;
38646 unsigned char perm[32];
38647 enum machine_mode vmode = V16QImode;
38648 rtx rperm[32], vperm, target, op0, op1;
38650 nelt = d->nelt;
38652 if (!d->one_operand_p)
38654 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38656 if (TARGET_AVX2
38657 && valid_perm_using_mode_p (V2TImode, d))
38659 if (d->testing_p)
38660 return true;
38662 /* Use vperm2i128 insn. The pattern uses
38663 V4DImode instead of V2TImode. */
38664 target = gen_lowpart (V4DImode, d->target);
38665 op0 = gen_lowpart (V4DImode, d->op0);
38666 op1 = gen_lowpart (V4DImode, d->op1);
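/* The vperm2i128 immediate selects one of the four 128-bit source
halves for each 128-bit half of the result: bits 0-1 pick the source
of the low half, bits 4-5 the source of the high half. */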
38667 rperm[0]
38668 = GEN_INT ((d->perm[0] / (nelt / 2))
38669 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38670 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38671 return true;
38673 return false;
38676 else
38678 if (GET_MODE_SIZE (d->vmode) == 16)
38680 if (!TARGET_SSSE3)
38681 return false;
38683 else if (GET_MODE_SIZE (d->vmode) == 32)
38685 if (!TARGET_AVX2)
38686 return false;
38688 /* V4DImode should be already handled through
38689 expand_vselect by vpermq instruction. */
38690 gcc_assert (d->vmode != V4DImode);
38692 vmode = V32QImode;
38693 if (d->vmode == V8SImode
38694 || d->vmode == V16HImode
38695 || d->vmode == V32QImode)
38697 /* First see if vpermq can be used for
38698 V8SImode/V16HImode/V32QImode. */
38699 if (valid_perm_using_mode_p (V4DImode, d))
38701 for (i = 0; i < 4; i++)
38702 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38703 if (d->testing_p)
38704 return true;
38705 return expand_vselect (gen_lowpart (V4DImode, d->target),
38706 gen_lowpart (V4DImode, d->op0),
38707 perm, 4, false);
38710 /* Next see if vpermd can be used. */
38711 if (valid_perm_using_mode_p (V8SImode, d))
38712 vmode = V8SImode;
38714 /* Or if vpermps can be used. */
38715 else if (d->vmode == V8SFmode)
38716 vmode = V8SImode;
38718 if (vmode == V32QImode)
38720 /* vpshufb only works intra lanes; it is not
38721 possible to shuffle bytes in between the lanes. */
38722 for (i = 0; i < nelt; ++i)
38723 if ((d->perm[i] ^ i) & (nelt / 2))
38724 return false;
38727 else
38728 return false;
38731 if (d->testing_p)
38732 return true;
38734 if (vmode == V8SImode)
38735 for (i = 0; i < 8; ++i)
38736 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38737 else
38739 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38740 if (!d->one_operand_p)
38741 mask = 2 * nelt - 1;
38742 else if (vmode == V16QImode)
38743 mask = nelt - 1;
38744 else
38745 mask = nelt / 2 - 1;
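/* Expand each element index into ELTSZ consecutive byte indices for
the byte shuffle. */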
38747 for (i = 0; i < nelt; ++i)
38749 unsigned j, e = d->perm[i] & mask;
38750 for (j = 0; j < eltsz; ++j)
38751 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38755 vperm = gen_rtx_CONST_VECTOR (vmode,
38756 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38757 vperm = force_reg (vmode, vperm);
38759 target = gen_lowpart (vmode, d->target);
38760 op0 = gen_lowpart (vmode, d->op0);
38761 if (d->one_operand_p)
38763 if (vmode == V16QImode)
38764 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38765 else if (vmode == V32QImode)
38766 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38767 else if (vmode == V8SFmode)
38768 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38769 else
38770 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38772 else
38774 op1 = gen_lowpart (vmode, d->op1);
38775 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38778 return true;
38781 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
38782 in a single instruction. */
38784 static bool
38785 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38787 unsigned i, nelt = d->nelt;
38788 unsigned char perm2[MAX_VECT_LEN];
38790 /* Check plain VEC_SELECT first, because AVX has instructions that could
38791 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38792 input where SEL+CONCAT may not. */
38793 if (d->one_operand_p)
38795 int mask = nelt - 1;
38796 bool identity_perm = true;
38797 bool broadcast_perm = true;
38799 for (i = 0; i < nelt; i++)
38801 perm2[i] = d->perm[i] & mask;
38802 if (perm2[i] != i)
38803 identity_perm = false;
38804 if (perm2[i])
38805 broadcast_perm = false;
38808 if (identity_perm)
38810 if (!d->testing_p)
38811 emit_move_insn (d->target, d->op0);
38812 return true;
38814 else if (broadcast_perm && TARGET_AVX2)
38816 /* Use vpbroadcast{b,w,d}. */
38817 rtx (*gen) (rtx, rtx) = NULL;
38818 switch (d->vmode)
38820 case V32QImode:
38821 gen = gen_avx2_pbroadcastv32qi_1;
38822 break;
38823 case V16HImode:
38824 gen = gen_avx2_pbroadcastv16hi_1;
38825 break;
38826 case V8SImode:
38827 gen = gen_avx2_pbroadcastv8si_1;
38828 break;
38829 case V16QImode:
38830 gen = gen_avx2_pbroadcastv16qi;
38831 break;
38832 case V8HImode:
38833 gen = gen_avx2_pbroadcastv8hi;
38834 break;
38835 case V8SFmode:
38836 gen = gen_avx2_vec_dupv8sf_1;
38837 break;
38838 /* For other modes prefer other shuffles this function creates. */
38839 default: break;
38841 if (gen != NULL)
38843 if (!d->testing_p)
38844 emit_insn (gen (d->target, d->op0));
38845 return true;
38849 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38850 return true;
38852 /* There are plenty of patterns in sse.md that are written for
38853 SEL+CONCAT and are not replicated for a single op. Perhaps
38854 that should be changed, to avoid the nastiness here. */
38856 /* Recognize interleave style patterns, which means incrementing
38857 every other permutation operand. */
38858 for (i = 0; i < nelt; i += 2)
38860 perm2[i] = d->perm[i] & mask;
38861 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38863 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38864 d->testing_p))
38865 return true;
38867 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38868 if (nelt >= 4)
38870 for (i = 0; i < nelt; i += 4)
38872 perm2[i + 0] = d->perm[i + 0] & mask;
38873 perm2[i + 1] = d->perm[i + 1] & mask;
38874 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38875 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38878 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38879 d->testing_p))
38880 return true;
38884 /* Finally, try the fully general two operand permute. */
38885 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38886 d->testing_p))
38887 return true;
38889 /* Recognize interleave style patterns with reversed operands. */
38890 if (!d->one_operand_p)
38892 for (i = 0; i < nelt; ++i)
38894 unsigned e = d->perm[i];
38895 if (e >= nelt)
38896 e -= nelt;
38897 else
38898 e += nelt;
38899 perm2[i] = e;
38902 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38903 d->testing_p))
38904 return true;
38907 /* Try the SSE4.1 blend variable merge instructions. */
38908 if (expand_vec_perm_blend (d))
38909 return true;
38911 /* Try one of the AVX vpermil variable permutations. */
38912 if (expand_vec_perm_vpermil (d))
38913 return true;
38915 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38916 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38917 if (expand_vec_perm_pshufb (d))
38918 return true;
38920 return false;
38923 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
38924 in terms of a pair of pshuflw + pshufhw instructions. */
38926 static bool
38927 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38929 unsigned char perm2[MAX_VECT_LEN];
38930 unsigned i;
38931 bool ok;
38933 if (d->vmode != V8HImode || !d->one_operand_p)
38934 return false;
38936 /* The two permutations only operate in 64-bit lanes. */
38937 for (i = 0; i < 4; ++i)
38938 if (d->perm[i] >= 4)
38939 return false;
38940 for (i = 4; i < 8; ++i)
38941 if (d->perm[i] < 4)
38942 return false;
38944 if (d->testing_p)
38945 return true;
38947 /* Emit the pshuflw. */
38948 memcpy (perm2, d->perm, 4);
38949 for (i = 4; i < 8; ++i)
38950 perm2[i] = i;
38951 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38952 gcc_assert (ok);
38954 /* Emit the pshufhw. */
38955 memcpy (perm2 + 4, d->perm + 4, 4);
38956 for (i = 0; i < 4; ++i)
38957 perm2[i] = i;
38958 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38959 gcc_assert (ok);
38961 return true;
38964 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
38965 the permutation using the SSSE3 palignr instruction. This succeeds
38966 when all of the elements in PERM fit within one vector and we merely
38967 need to shift them down so that a single vector permutation has a
38968 chance to succeed. */
38970 static bool
38971 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38973 unsigned i, nelt = d->nelt;
38974 unsigned min, max;
38975 bool in_order, ok;
38976 rtx shift;
38978 /* Even with AVX, palignr only operates on 128-bit vectors. */
38979 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38980 return false;
38982 min = nelt, max = 0;
38983 for (i = 0; i < nelt; ++i)
38985 unsigned e = d->perm[i];
38986 if (e < min)
38987 min = e;
38988 if (e > max)
38989 max = e;
38991 if (min == 0 || max - min >= nelt)
38992 return false;
38994 /* Given that we have SSSE3, we know we'll be able to implement the
38995 single operand permutation after the palignr with pshufb. */
38996 if (d->testing_p)
38997 return true;
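/* Shift the double-width concatenation op1:op0 right by MIN elements,
so that the wanted window starts at element 0 and the rest can be done
as a single-operand permutation. */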
38999 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39000 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39001 gen_lowpart (TImode, d->op1),
39002 gen_lowpart (TImode, d->op0), shift));
39004 d->op0 = d->op1 = d->target;
39005 d->one_operand_p = true;
39007 in_order = true;
39008 for (i = 0; i < nelt; ++i)
39010 unsigned e = d->perm[i] - min;
39011 if (e != i)
39012 in_order = false;
39013 d->perm[i] = e;
39016 /* Test for the degenerate case where the alignment by itself
39017 produces the desired permutation. */
39018 if (in_order)
39019 return true;
39021 ok = expand_vec_perm_1 (d);
39022 gcc_assert (ok);
39024 return ok;
39027 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39029 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39030 a two vector permutation into a single vector permutation by using
39031 an interleave operation to merge the vectors. */
39033 static bool
39034 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39036 struct expand_vec_perm_d dremap, dfinal;
39037 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39038 unsigned HOST_WIDE_INT contents;
39039 unsigned char remap[2 * MAX_VECT_LEN];
39040 rtx seq;
39041 bool ok, same_halves = false;
39043 if (GET_MODE_SIZE (d->vmode) == 16)
39045 if (d->one_operand_p)
39046 return false;
39048 else if (GET_MODE_SIZE (d->vmode) == 32)
39050 if (!TARGET_AVX)
39051 return false;
39052 /* For 32-byte modes allow even d->one_operand_p.
39053 The lack of cross-lane shuffling in some instructions
39054 might prevent a single insn shuffle. */
39055 dfinal = *d;
39056 dfinal.testing_p = true;
39057 /* If expand_vec_perm_interleave3 can expand this into
39058 a 3 insn sequence, give up and let it be expanded as
39059 a 3 insn sequence. While that is one insn longer,
39060 it doesn't need a memory operand, and in the common
39061 case that both the interleave low and high permutations
39062 with the same operands are adjacent it needs only 4 insns
39063 for both after CSE. */
39064 if (expand_vec_perm_interleave3 (&dfinal))
39065 return false;
39067 else
39068 return false;
39070 /* Examine from whence the elements come. */
39071 contents = 0;
39072 for (i = 0; i < nelt; ++i)
39073 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
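/* Bit N of CONTENTS is now set iff input element N (0 .. 2*nelt-1)
is referenced by the permutation. */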
39075 memset (remap, 0xff, sizeof (remap));
39076 dremap = *d;
39078 if (GET_MODE_SIZE (d->vmode) == 16)
39080 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39082 /* Split the two input vectors into 4 halves. */
39083 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39084 h2 = h1 << nelt2;
39085 h3 = h2 << nelt2;
39086 h4 = h3 << nelt2;
39088 /* If the elements all come from the low halves, use interleave low; similarly
39089 for interleave high. If the elements are from mis-matched halves, we
39090 can use shufps for V4SF/V4SI or do a DImode shuffle. */
39091 if ((contents & (h1 | h3)) == contents)
39093 /* punpckl* */
39094 for (i = 0; i < nelt2; ++i)
39096 remap[i] = i * 2;
39097 remap[i + nelt] = i * 2 + 1;
39098 dremap.perm[i * 2] = i;
39099 dremap.perm[i * 2 + 1] = i + nelt;
39101 if (!TARGET_SSE2 && d->vmode == V4SImode)
39102 dremap.vmode = V4SFmode;
39104 else if ((contents & (h2 | h4)) == contents)
39106 /* punpckh* */
39107 for (i = 0; i < nelt2; ++i)
39109 remap[i + nelt2] = i * 2;
39110 remap[i + nelt + nelt2] = i * 2 + 1;
39111 dremap.perm[i * 2] = i + nelt2;
39112 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39114 if (!TARGET_SSE2 && d->vmode == V4SImode)
39115 dremap.vmode = V4SFmode;
39117 else if ((contents & (h1 | h4)) == contents)
39119 /* shufps */
39120 for (i = 0; i < nelt2; ++i)
39122 remap[i] = i;
39123 remap[i + nelt + nelt2] = i + nelt2;
39124 dremap.perm[i] = i;
39125 dremap.perm[i + nelt2] = i + nelt + nelt2;
39127 if (nelt != 4)
39129 /* shufpd */
39130 dremap.vmode = V2DImode;
39131 dremap.nelt = 2;
39132 dremap.perm[0] = 0;
39133 dremap.perm[1] = 3;
39136 else if ((contents & (h2 | h3)) == contents)
39138 /* shufps */
39139 for (i = 0; i < nelt2; ++i)
39141 remap[i + nelt2] = i;
39142 remap[i + nelt] = i + nelt2;
39143 dremap.perm[i] = i + nelt2;
39144 dremap.perm[i + nelt2] = i + nelt;
39146 if (nelt != 4)
39148 /* shufpd */
39149 dremap.vmode = V2DImode;
39150 dremap.nelt = 2;
39151 dremap.perm[0] = 1;
39152 dremap.perm[1] = 2;
39155 else
39156 return false;
39158 else
39160 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39161 unsigned HOST_WIDE_INT q[8];
39162 unsigned int nonzero_halves[4];
39164 /* Split the two input vectors into 8 quarters. */
39165 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39166 for (i = 1; i < 8; ++i)
39167 q[i] = q[0] << (nelt4 * i);
39168 for (i = 0; i < 4; ++i)
39169 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39171 nonzero_halves[nzcnt] = i;
39172 ++nzcnt;
39175 if (nzcnt == 1)
39177 gcc_assert (d->one_operand_p);
39178 nonzero_halves[1] = nonzero_halves[0];
39179 same_halves = true;
39181 else if (d->one_operand_p)
39183 gcc_assert (nonzero_halves[0] == 0);
39184 gcc_assert (nonzero_halves[1] == 1);
39187 if (nzcnt <= 2)
39189 if (d->perm[0] / nelt2 == nonzero_halves[1])
39191 /* Attempt to increase the likelihood that dfinal
39192 shuffle will be intra-lane. */
39193 char tmph = nonzero_halves[0];
39194 nonzero_halves[0] = nonzero_halves[1];
39195 nonzero_halves[1] = tmph;
39198 /* vperm2f128 or vperm2i128. */
39199 for (i = 0; i < nelt2; ++i)
39201 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39202 remap[i + nonzero_halves[0] * nelt2] = i;
39203 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39204 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39207 if (d->vmode != V8SFmode
39208 && d->vmode != V4DFmode
39209 && d->vmode != V8SImode)
39211 dremap.vmode = V8SImode;
39212 dremap.nelt = 8;
39213 for (i = 0; i < 4; ++i)
39215 dremap.perm[i] = i + nonzero_halves[0] * 4;
39216 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39220 else if (d->one_operand_p)
39221 return false;
39222 else if (TARGET_AVX2
39223 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39225 /* vpunpckl* */
39226 for (i = 0; i < nelt4; ++i)
39228 remap[i] = i * 2;
39229 remap[i + nelt] = i * 2 + 1;
39230 remap[i + nelt2] = i * 2 + nelt2;
39231 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39232 dremap.perm[i * 2] = i;
39233 dremap.perm[i * 2 + 1] = i + nelt;
39234 dremap.perm[i * 2 + nelt2] = i + nelt2;
39235 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39238 else if (TARGET_AVX2
39239 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39241 /* vpunpckh* */
39242 for (i = 0; i < nelt4; ++i)
39244 remap[i + nelt4] = i * 2;
39245 remap[i + nelt + nelt4] = i * 2 + 1;
39246 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39247 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39248 dremap.perm[i * 2] = i + nelt4;
39249 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39250 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39251 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39254 else
39255 return false;
39258 /* Use the remapping array set up above to move the elements from their
39259 swizzled locations into their final destinations. */
39260 dfinal = *d;
39261 for (i = 0; i < nelt; ++i)
39263 unsigned e = remap[d->perm[i]];
39264 gcc_assert (e < nelt);
39265 /* If same_halves is true, both halves of the remapped vector are the
39266 same. Avoid cross-lane accesses if possible. */
39267 if (same_halves && i >= nelt2)
39269 gcc_assert (e < nelt2);
39270 dfinal.perm[i] = e + nelt2;
39272 else
39273 dfinal.perm[i] = e;
39275 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39276 dfinal.op1 = dfinal.op0;
39277 dfinal.one_operand_p = true;
39278 dremap.target = dfinal.op0;
39280 /* Test if the final remap can be done with a single insn. For V4SFmode or
39281 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39282 start_sequence ();
39283 ok = expand_vec_perm_1 (&dfinal);
39284 seq = get_insns ();
39285 end_sequence ();
39287 if (!ok)
39288 return false;
39290 if (d->testing_p)
39291 return true;
39293 if (dremap.vmode != dfinal.vmode)
39295 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39296 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39297 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39300 ok = expand_vec_perm_1 (&dremap);
39301 gcc_assert (ok);
39303 emit_insn (seq);
39304 return true;
39307 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39308 a single vector cross-lane permutation into vpermq followed
39309 by any of the single insn permutations. */
39311 static bool
39312 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39314 struct expand_vec_perm_d dremap, dfinal;
39315 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39316 unsigned contents[2];
39317 bool ok;
39319 if (!(TARGET_AVX2
39320 && (d->vmode == V32QImode || d->vmode == V16HImode)
39321 && d->one_operand_p))
39322 return false;
39324 contents[0] = 0;
39325 contents[1] = 0;
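/* contents[0] and contents[1] record which 64-bit chunks of the input
feed the low resp. high half of the result; vpermq can supply at most
two chunks per half. */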
39326 for (i = 0; i < nelt2; ++i)
39328 contents[0] |= 1u << (d->perm[i] / nelt4);
39329 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39332 for (i = 0; i < 2; ++i)
39334 unsigned int cnt = 0;
39335 for (j = 0; j < 4; ++j)
39336 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39337 return false;
39340 if (d->testing_p)
39341 return true;
39343 dremap = *d;
39344 dremap.vmode = V4DImode;
39345 dremap.nelt = 4;
39346 dremap.target = gen_reg_rtx (V4DImode);
39347 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39348 dremap.op1 = dremap.op0;
39349 dremap.one_operand_p = true;
39350 for (i = 0; i < 2; ++i)
39352 unsigned int cnt = 0;
39353 for (j = 0; j < 4; ++j)
39354 if ((contents[i] & (1u << j)) != 0)
39355 dremap.perm[2 * i + cnt++] = j;
39356 for (; cnt < 2; ++cnt)
39357 dremap.perm[2 * i + cnt] = 0;
39360 dfinal = *d;
39361 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39362 dfinal.op1 = dfinal.op0;
39363 dfinal.one_operand_p = true;
39364 for (i = 0, j = 0; i < nelt; ++i)
39366 if (i == nelt2)
39367 j = 2;
39368 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39369 if ((d->perm[i] / nelt4) == dremap.perm[j])
39371 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39372 dfinal.perm[i] |= nelt4;
39373 else
39374 gcc_unreachable ();
39377 ok = expand_vec_perm_1 (&dremap);
39378 gcc_assert (ok);
39380 ok = expand_vec_perm_1 (&dfinal);
39381 gcc_assert (ok);
39383 return true;
39386 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
39387 a vector permutation using two instructions, vperm2f128 resp.
39388 vperm2i128 followed by any single in-lane permutation. */
39390 static bool
39391 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39393 struct expand_vec_perm_d dfirst, dsecond;
39394 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39395 bool ok;
39397 if (!TARGET_AVX
39398 || GET_MODE_SIZE (d->vmode) != 32
39399 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39400 return false;
39402 dsecond = *d;
39403 dsecond.one_operand_p = false;
39404 dsecond.testing_p = true;
39406 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39407 immediate. For perm < 16 the second permutation uses
39408 d->op0 as first operand, for perm >= 16 it uses d->op1
39409 as first operand. The second operand is the result of
39410 vperm2[fi]128. */
39411 for (perm = 0; perm < 32; perm++)
39413 /* Ignore permutations which do not move anything cross-lane. */
39414 if (perm < 16)
39416 /* The second shuffle for e.g. V4DFmode has
39417 0123 and ABCD operands.
39418 Ignore AB23, as 23 is already in the second lane
39419 of the first operand. */
39420 if ((perm & 0xc) == (1 << 2)) continue;
39421 /* And 01CD, as 01 is in the first lane of the first
39422 operand. */
39423 if ((perm & 3) == 0) continue;
39424 /* And 4567, as then the vperm2[fi]128 doesn't change
39425 anything on the original 4567 second operand. */
39426 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39428 else
39430 /* The second shuffle for e.g. V4DFmode has
39431 4567 and ABCD operands.
39432 Ignore AB67, as 67 is already in the second lane
39433 of the first operand. */
39434 if ((perm & 0xc) == (3 << 2)) continue;
39435 /* And 45CD, as 45 is in the first lane of the first
39436 operand. */
39437 if ((perm & 3) == 2) continue;
39438 /* And 0123, as then the vperm2[fi]128 doesn't change
39439 anything on the original 0123 first operand. */
39440 if ((perm & 0xf) == (1 << 2)) continue;
39443 for (i = 0; i < nelt; i++)
39445 j = d->perm[i] / nelt2;
39446 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39447 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39448 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39449 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39450 else
39451 break;
39454 if (i == nelt)
39456 start_sequence ();
39457 ok = expand_vec_perm_1 (&dsecond);
39458 end_sequence ();
39460 else
39461 ok = false;
39463 if (ok)
39465 if (d->testing_p)
39466 return true;
39468 /* Found a usable second shuffle. dfirst will be
39469 vperm2f128 on d->op0 and d->op1. */
39470 dsecond.testing_p = false;
39471 dfirst = *d;
39472 dfirst.target = gen_reg_rtx (d->vmode);
39473 for (i = 0; i < nelt; i++)
39474 dfirst.perm[i] = (i & (nelt2 - 1))
39475 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39477 ok = expand_vec_perm_1 (&dfirst);
39478 gcc_assert (ok);
39480 /* And dsecond is some single insn shuffle, taking
39481 d->op0 and result of vperm2f128 (if perm < 16) or
39482 d->op1 and result of vperm2f128 (otherwise). */
39483 dsecond.op1 = dfirst.target;
39484 if (perm >= 16)
39485 dsecond.op0 = dfirst.op1;
39487 ok = expand_vec_perm_1 (&dsecond);
39488 gcc_assert (ok);
39490 return true;
39493 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39494 if (d->one_operand_p)
39495 return false;
39498 return false;
39501 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
39502 a two vector permutation using 2 intra-lane interleave insns
39503 and cross-lane shuffle for 32-byte vectors. */
39505 static bool
39506 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39508 unsigned i, nelt;
39509 rtx (*gen) (rtx, rtx, rtx);
39511 if (d->one_operand_p)
39512 return false;
39513 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39515 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39517 else
39518 return false;
39520 nelt = d->nelt;
39521 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39522 return false;
39523 for (i = 0; i < nelt; i += 2)
39524 if (d->perm[i] != d->perm[0] + i / 2
39525 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39526 return false;
39528 if (d->testing_p)
39529 return true;
39531 switch (d->vmode)
39533 case V32QImode:
39534 if (d->perm[0])
39535 gen = gen_vec_interleave_highv32qi;
39536 else
39537 gen = gen_vec_interleave_lowv32qi;
39538 break;
39539 case V16HImode:
39540 if (d->perm[0])
39541 gen = gen_vec_interleave_highv16hi;
39542 else
39543 gen = gen_vec_interleave_lowv16hi;
39544 break;
39545 case V8SImode:
39546 if (d->perm[0])
39547 gen = gen_vec_interleave_highv8si;
39548 else
39549 gen = gen_vec_interleave_lowv8si;
39550 break;
39551 case V4DImode:
39552 if (d->perm[0])
39553 gen = gen_vec_interleave_highv4di;
39554 else
39555 gen = gen_vec_interleave_lowv4di;
39556 break;
39557 case V8SFmode:
39558 if (d->perm[0])
39559 gen = gen_vec_interleave_highv8sf;
39560 else
39561 gen = gen_vec_interleave_lowv8sf;
39562 break;
39563 case V4DFmode:
39564 if (d->perm[0])
39565 gen = gen_vec_interleave_highv4df;
39566 else
39567 gen = gen_vec_interleave_lowv4df;
39568 break;
39569 default:
39570 gcc_unreachable ();
39573 emit_insn (gen (d->target, d->op0, d->op1));
39574 return true;
39577 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
39578 a single vector permutation using a single intra-lane vector
39579 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39580 the non-swapped and swapped vectors together. */
39582 static bool
39583 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39585 struct expand_vec_perm_d dfirst, dsecond;
39586 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39587 rtx seq;
39588 bool ok;
39589 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39591 if (!TARGET_AVX
39592 || TARGET_AVX2
39593 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39594 || !d->one_operand_p)
39595 return false;
39597 dfirst = *d;
39598 for (i = 0; i < nelt; i++)
39599 dfirst.perm[i] = 0xff;
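/* Build the intra-lane shuffle DFIRST; MSK records which result
elements must instead be taken from the lane-swapped copy. */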
39600 for (i = 0, msk = 0; i < nelt; i++)
39602 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39603 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39604 return false;
39605 dfirst.perm[j] = d->perm[i];
39606 if (j != i)
39607 msk |= (1 << i);
39609 for (i = 0; i < nelt; i++)
39610 if (dfirst.perm[i] == 0xff)
39611 dfirst.perm[i] = i;
39613 if (!d->testing_p)
39614 dfirst.target = gen_reg_rtx (dfirst.vmode);
39616 start_sequence ();
39617 ok = expand_vec_perm_1 (&dfirst);
39618 seq = get_insns ();
39619 end_sequence ();
39621 if (!ok)
39622 return false;
39624 if (d->testing_p)
39625 return true;
39627 emit_insn (seq);
39629 dsecond = *d;
39630 dsecond.op0 = dfirst.target;
39631 dsecond.op1 = dfirst.target;
39632 dsecond.one_operand_p = true;
39633 dsecond.target = gen_reg_rtx (dsecond.vmode);
39634 for (i = 0; i < nelt; i++)
39635 dsecond.perm[i] = i ^ nelt2;
39637 ok = expand_vec_perm_1 (&dsecond);
39638 gcc_assert (ok);
39640 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39641 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39642 return true;
39645 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
39646 permutation using two vperm2f128, followed by a vshufpd insn blending
39647 the two vectors together. */
39649 static bool
39650 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39652 struct expand_vec_perm_d dfirst, dsecond, dthird;
39653 bool ok;
39655 if (!TARGET_AVX || (d->vmode != V4DFmode))
39656 return false;
39658 if (d->testing_p)
39659 return true;
39661 dfirst = *d;
39662 dsecond = *d;
39663 dthird = *d;
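/* DFIRST gathers the aligned element pairs needed for result elements
0 and 2, DSECOND those needed for elements 1 and 3; DTHIRD then picks
the required element out of each pair with a two-operand shuffle. */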
39665 dfirst.perm[0] = (d->perm[0] & ~1);
39666 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39667 dfirst.perm[2] = (d->perm[2] & ~1);
39668 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39669 dsecond.perm[0] = (d->perm[1] & ~1);
39670 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39671 dsecond.perm[2] = (d->perm[3] & ~1);
39672 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39673 dthird.perm[0] = (d->perm[0] % 2);
39674 dthird.perm[1] = (d->perm[1] % 2) + 4;
39675 dthird.perm[2] = (d->perm[2] % 2) + 2;
39676 dthird.perm[3] = (d->perm[3] % 2) + 6;
39678 dfirst.target = gen_reg_rtx (dfirst.vmode);
39679 dsecond.target = gen_reg_rtx (dsecond.vmode);
39680 dthird.op0 = dfirst.target;
39681 dthird.op1 = dsecond.target;
39682 dthird.one_operand_p = false;
39684 canonicalize_perm (&dfirst);
39685 canonicalize_perm (&dsecond);
39687 ok = expand_vec_perm_1 (&dfirst)
39688 && expand_vec_perm_1 (&dsecond)
39689 && expand_vec_perm_1 (&dthird);
39691 gcc_assert (ok);
39693 return true;
39696 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39697 permutation with two pshufb insns and an ior. We should have already
39698 failed all two instruction sequences. */
39700 static bool
39701 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39703 rtx rperm[2][16], vperm, l, h, op, m128;
39704 unsigned int i, nelt, eltsz;
39706 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39707 return false;
39708 gcc_assert (!d->one_operand_p);
39710 nelt = d->nelt;
39711 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39713 /* Generate two permutation masks. If the required element is within
39714 the given vector it is shuffled into the proper lane. If the required
39715 element is in the other vector, force a zero into the lane by setting
39716 bit 7 in the permutation mask. */
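/* A mask byte of -128 has bit 7 set, which makes pshufb write zero to
the corresponding destination byte. */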
39717 m128 = GEN_INT (-128);
39718 for (i = 0; i < nelt; ++i)
39720 unsigned j, e = d->perm[i];
39721 unsigned which = (e >= nelt);
39722 if (e >= nelt)
39723 e -= nelt;
39725 for (j = 0; j < eltsz; ++j)
39727 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39728 rperm[1-which][i*eltsz + j] = m128;
39732 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39733 vperm = force_reg (V16QImode, vperm);
39735 l = gen_reg_rtx (V16QImode);
39736 op = gen_lowpart (V16QImode, d->op0);
39737 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39739 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39740 vperm = force_reg (V16QImode, vperm);
39742 h = gen_reg_rtx (V16QImode);
39743 op = gen_lowpart (V16QImode, d->op1);
39744 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39746 op = gen_lowpart (V16QImode, d->target);
39747 emit_insn (gen_iorv16qi3 (op, l, h));
39749 return true;
39752 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39753 with two vpshufb insns, vpermq and vpor. We should have already failed
39754 all two or three instruction sequences. */
39756 static bool
39757 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39759 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39760 unsigned int i, nelt, eltsz;
39762 if (!TARGET_AVX2
39763 || !d->one_operand_p
39764 || (d->vmode != V32QImode && d->vmode != V16HImode))
39765 return false;
39767 if (d->testing_p)
39768 return true;
39770 nelt = d->nelt;
39771 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39773 /* Generate two permutation masks. If the required element is within
39774 the same lane, it is shuffled in. If the required element is from the
39775 other lane, force a zero by setting bit 7 in the permutation mask.
39776 The other mask has non-negative elements where an element
39777 is requested from the other lane, but also moved to the other lane,
39778 so that the result of vpshufb can have the two V2TImode halves
39779 swapped. */
39780 m128 = GEN_INT (-128);
39781 for (i = 0; i < nelt; ++i)
39783 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39784 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39786 for (j = 0; j < eltsz; ++j)
39788 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39789 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39793 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39794 vperm = force_reg (V32QImode, vperm);
39796 h = gen_reg_rtx (V32QImode);
39797 op = gen_lowpart (V32QImode, d->op0);
39798 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39800 /* Swap the 128-bit lanes of h into hp. */
39801 hp = gen_reg_rtx (V4DImode);
39802 op = gen_lowpart (V4DImode, h);
39803 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39804 const1_rtx));
39806 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39807 vperm = force_reg (V32QImode, vperm);
39809 l = gen_reg_rtx (V32QImode);
39810 op = gen_lowpart (V32QImode, d->op0);
39811 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39813 op = gen_lowpart (V32QImode, d->target);
39814 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39816 return true;
39819 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39820 and extract-odd permutations of two V32QImode or V16HImode operands
39821 with two vpshufb insns, vpor and vpermq. We should have already
39822 failed all two or three instruction sequences. */
39824 static bool
39825 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39827 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39828 unsigned int i, nelt, eltsz;
39830 if (!TARGET_AVX2
39831 || d->one_operand_p
39832 || (d->vmode != V32QImode && d->vmode != V16HImode))
39833 return false;
39835 for (i = 0; i < d->nelt; ++i)
39836 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39837 return false;
39839 if (d->testing_p)
39840 return true;
39842 nelt = d->nelt;
39843 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39845 /* Generate two permutation masks. In the first permutation mask
39846 the first quarter will contain indexes for the first half
39847 of the op0, the second quarter will contain bit 7 set, third quarter
39848 will contain indexes for the second half of the op0 and the
39849 last quarter bit 7 set. In the second permutation mask
39850 the first quarter will contain bit 7 set, the second quarter
39851 indexes for the first half of the op1, the third quarter bit 7 set
39852 and last quarter indexes for the second half of the op1.
39853 I.e. the first mask e.g. for V32QImode extract even will be:
39854 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39855 (all values masked with 0xf except for -128) and second mask
39856 for extract even will be
39857 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39858 m128 = GEN_INT (-128);
39859 for (i = 0; i < nelt; ++i)
39861 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39862 unsigned which = d->perm[i] >= nelt;
39863 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39865 for (j = 0; j < eltsz; ++j)
39867 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39868 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39872 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39873 vperm = force_reg (V32QImode, vperm);
39875 l = gen_reg_rtx (V32QImode);
39876 op = gen_lowpart (V32QImode, d->op0);
39877 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39879 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39880 vperm = force_reg (V32QImode, vperm);
39882 h = gen_reg_rtx (V32QImode);
39883 op = gen_lowpart (V32QImode, d->op1);
39884 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39886 ior = gen_reg_rtx (V32QImode);
39887 emit_insn (gen_iorv32qi3 (ior, l, h));
39889 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39890 op = gen_lowpart (V4DImode, d->target);
39891 ior = gen_lowpart (V4DImode, ior);
39892 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39893 const1_rtx, GEN_INT (3)));
39895 return true;
39898 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
39899 and extract-odd permutations. */
39901 static bool
39902 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39904 rtx t1, t2, t3;
39906 switch (d->vmode)
39908 case V4DFmode:
39909 t1 = gen_reg_rtx (V4DFmode);
39910 t2 = gen_reg_rtx (V4DFmode);
39912 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39913 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39914 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39916 /* Now an unpck[lh]pd will produce the result required. */
39917 if (odd)
39918 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39919 else
39920 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39921 emit_insn (t3);
39922 break;
39924 case V8SFmode:
39926 int mask = odd ? 0xdd : 0x88;
39928 t1 = gen_reg_rtx (V8SFmode);
39929 t2 = gen_reg_rtx (V8SFmode);
39930 t3 = gen_reg_rtx (V8SFmode);
39932 /* Shuffle within the 128-bit lanes to produce:
39933 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39934 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39935 GEN_INT (mask)));
39937 /* Shuffle the lanes around to produce:
39938 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39939 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39940 GEN_INT (0x3)));
39942 /* Shuffle within the 128-bit lanes to produce:
39943 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39944 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39946 /* Shuffle within the 128-bit lanes to produce:
39947 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39948 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39950 /* Shuffle the lanes around to produce:
39951 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39952 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39953 GEN_INT (0x20)));
39955 break;
39957 case V2DFmode:
39958 case V4SFmode:
39959 case V2DImode:
39960 case V4SImode:
39961 /* These are always directly implementable by expand_vec_perm_1. */
39962 gcc_unreachable ();
39964 case V8HImode:
39965 if (TARGET_SSSE3)
39966 return expand_vec_perm_pshufb2 (d);
39967 else
39969 /* We need 2*log2(N)-1 operations to achieve odd/even
39970 with interleave. */
39971 t1 = gen_reg_rtx (V8HImode);
39972 t2 = gen_reg_rtx (V8HImode);
39973 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39974 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39975 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39976 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39977 if (odd)
39978 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39979 else
39980 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
39981 emit_insn (t3);
39983 break;
39985 case V16QImode:
39986 if (TARGET_SSSE3)
39987 return expand_vec_perm_pshufb2 (d);
39988 else
39990 t1 = gen_reg_rtx (V16QImode);
39991 t2 = gen_reg_rtx (V16QImode);
39992 t3 = gen_reg_rtx (V16QImode);
39993 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
39994 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
39995 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
39996 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
39997 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
39998 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
39999 if (odd)
40000 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40001 else
40002 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40003 emit_insn (t3);
40005 break;
40007 case V16HImode:
40008 case V32QImode:
40009 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40011 case V4DImode:
40012 if (!TARGET_AVX2)
40014 struct expand_vec_perm_d d_copy = *d;
40015 d_copy.vmode = V4DFmode;
40016 d_copy.target = gen_lowpart (V4DFmode, d->target);
40017 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40018 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40019 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40022 t1 = gen_reg_rtx (V4DImode);
40023 t2 = gen_reg_rtx (V4DImode);
40025 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40026 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40027 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40029 /* Now a vpunpck[lh]qdq will produce the result required. */
40030 if (odd)
40031 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40032 else
40033 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40034 emit_insn (t3);
40035 break;
40037 case V8SImode:
40038 if (!TARGET_AVX2)
40040 struct expand_vec_perm_d d_copy = *d;
40041 d_copy.vmode = V8SFmode;
40042 d_copy.target = gen_lowpart (V8SFmode, d->target);
40043 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40044 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40045 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40048 t1 = gen_reg_rtx (V8SImode);
40049 t2 = gen_reg_rtx (V8SImode);
40051 /* Shuffle the lanes around into
40052 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40053 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40054 gen_lowpart (V4DImode, d->op0),
40055 gen_lowpart (V4DImode, d->op1),
40056 GEN_INT (0x20)));
40057 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40058 gen_lowpart (V4DImode, d->op0),
40059 gen_lowpart (V4DImode, d->op1),
40060 GEN_INT (0x31)));
40062 /* Swap the 2nd and 3rd position in each lane into
40063 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40064 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40065 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40066 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40067 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40069 /* Now a vpunpck[lh]qdq will produce
40070 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40071 if (odd)
40072 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40073 gen_lowpart (V4DImode, t1),
40074 gen_lowpart (V4DImode, t2));
40075 else
40076 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40077 gen_lowpart (V4DImode, t1),
40078 gen_lowpart (V4DImode, t2));
40079 emit_insn (t3);
40080 break;
40082 default:
40083 gcc_unreachable ();
40086 return true;
40089 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40090 extract-even and extract-odd permutations. */
40092 static bool
40093 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40095 unsigned i, odd, nelt = d->nelt;
40097 odd = d->perm[0];
40098 if (odd != 0 && odd != 1)
40099 return false;
40101 for (i = 1; i < nelt; ++i)
40102 if (d->perm[i] != 2 * i + odd)
40103 return false;
40105 return expand_vec_perm_even_odd_1 (d, odd);
40108 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
40109 permutations. We assume that expand_vec_perm_1 has already failed. */
40111 static bool
40112 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40114 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40115 enum machine_mode vmode = d->vmode;
40116 unsigned char perm2[4];
40117 rtx op0 = d->op0;
40118 bool ok;
40120 switch (vmode)
40122 case V4DFmode:
40123 case V8SFmode:
40124 /* These are special-cased in sse.md so that we can optionally
40125 use the vbroadcast instruction. They expand to two insns
40126 if the input happens to be in a register. */
40127 gcc_unreachable ();
40129 case V2DFmode:
40130 case V2DImode:
40131 case V4SFmode:
40132 case V4SImode:
40133 /* These are always implementable using standard shuffle patterns. */
40134 gcc_unreachable ();
40136 case V8HImode:
40137 case V16QImode:
40138 /* These can be implemented via interleave. We save one insn by
40139 stopping once we have promoted to V4SImode and then use pshufd. */
do
40142 rtx dest;
40143 rtx (*gen) (rtx, rtx, rtx)
40144 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40145 : gen_vec_interleave_lowv8hi;
40147 if (elt >= nelt2)
40149 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40150 : gen_vec_interleave_highv8hi;
40151 elt -= nelt2;
40153 nelt2 /= 2;
40155 dest = gen_reg_rtx (vmode);
40156 emit_insn (gen (dest, op0, op0));
40157 vmode = get_mode_wider_vector (vmode);
40158 op0 = gen_lowpart (vmode, dest);
40160 while (vmode != V4SImode);
40162 memset (perm2, elt, 4);
40163 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40164 d->testing_p);
40165 gcc_assert (ok);
40166 return true;
40168 case V32QImode:
40169 case V16HImode:
40170 case V8SImode:
40171 case V4DImode:
40172 /* For AVX2 broadcasts of the first element vpbroadcast* or
40173 vpermq should be used by expand_vec_perm_1. */
40174 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40175 return false;
40177 default:
40178 gcc_unreachable ();
40182 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
40183 broadcast permutations. */
40185 static bool
40186 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40188 unsigned i, elt, nelt = d->nelt;
40190 if (!d->one_operand_p)
40191 return false;
40193 elt = d->perm[0];
40194 for (i = 1; i < nelt; ++i)
40195 if (d->perm[i] != elt)
40196 return false;
40198 return expand_vec_perm_broadcast_1 (d);
40201 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40202 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40203 all the shorter instruction sequences. */
40205 static bool
40206 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40208 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40209 unsigned int i, nelt, eltsz;
40210 bool used[4];
40212 if (!TARGET_AVX2
40213 || d->one_operand_p
40214 || (d->vmode != V32QImode && d->vmode != V16HImode))
40215 return false;
40217 if (d->testing_p)
40218 return true;
40220 nelt = d->nelt;
40221 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40223 /* Generate 4 permutation masks. If the required element is within
40224 the same lane, it is shuffled in. If the required element is from the
40225 other lane, force a zero by setting bit 7 in the permutation mask.
40226 The other mask has non-negative elements where an element
40227 is requested from the other lane, but also moved to the other lane,
40228 so that the result of vpshufb can have the two V2TImode halves
40229 swapped. */
40230 m128 = GEN_INT (-128);
40231 for (i = 0; i < 32; ++i)
40233 rperm[0][i] = m128;
40234 rperm[1][i] = m128;
40235 rperm[2][i] = m128;
40236 rperm[3][i] = m128;
40238 used[0] = false;
40239 used[1] = false;
40240 used[2] = false;
40241 used[3] = false;
40242 for (i = 0; i < nelt; ++i)
40244 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40245 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40246 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40248 for (j = 0; j < eltsz; ++j)
40249 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40250 used[which] = true;
40253 for (i = 0; i < 2; ++i)
40255 if (!used[2 * i + 1])
40257 h[i] = NULL_RTX;
40258 continue;
40260 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40261 gen_rtvec_v (32, rperm[2 * i + 1]));
40262 vperm = force_reg (V32QImode, vperm);
40263 h[i] = gen_reg_rtx (V32QImode);
40264 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40265 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40268 /* Swap the 128-bit lanes of h[X]. */
40269 for (i = 0; i < 2; ++i)
40271 if (h[i] == NULL_RTX)
40272 continue;
40273 op = gen_reg_rtx (V4DImode);
40274 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40275 const2_rtx, GEN_INT (3), const0_rtx,
40276 const1_rtx));
40277 h[i] = gen_lowpart (V32QImode, op);
40280 for (i = 0; i < 2; ++i)
40282 if (!used[2 * i])
40284 l[i] = NULL_RTX;
40285 continue;
40287 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40288 vperm = force_reg (V32QImode, vperm);
40289 l[i] = gen_reg_rtx (V32QImode);
40290 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40291 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40294 for (i = 0; i < 2; ++i)
40296 if (h[i] && l[i])
40298 op = gen_reg_rtx (V32QImode);
40299 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40300 l[i] = op;
40302 else if (h[i])
40303 l[i] = h[i];
40306 gcc_assert (l[0] && l[1]);
40307 op = gen_lowpart (V32QImode, d->target);
40308 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40309 return true;
40312 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40313 With all of the interface bits taken care of, perform the expansion
40314 in D and return true on success. */
40316 static bool
40317 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40319 /* Try a single instruction expansion. */
40320 if (expand_vec_perm_1 (d))
40321 return true;
40323 /* Try sequences of two instructions. */
40325 if (expand_vec_perm_pshuflw_pshufhw (d))
40326 return true;
40328 if (expand_vec_perm_palignr (d))
40329 return true;
40331 if (expand_vec_perm_interleave2 (d))
40332 return true;
40334 if (expand_vec_perm_broadcast (d))
40335 return true;
40337 if (expand_vec_perm_vpermq_perm_1 (d))
40338 return true;
40340 if (expand_vec_perm_vperm2f128 (d))
40341 return true;
40343 /* Try sequences of three instructions. */
40345 if (expand_vec_perm_2vperm2f128_vshuf (d))
40346 return true;
40348 if (expand_vec_perm_pshufb2 (d))
40349 return true;
40351 if (expand_vec_perm_interleave3 (d))
40352 return true;
40354 if (expand_vec_perm_vperm2f128_vblend (d))
40355 return true;
40357 /* Try sequences of four instructions. */
40359 if (expand_vec_perm_vpshufb2_vpermq (d))
40360 return true;
40362 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40363 return true;
40365 /* ??? Look for narrow permutations whose element orderings would
40366 allow the promotion to a wider mode. */
40368 /* ??? Look for sequences of interleave or a wider permute that place
40369 the data into the correct lanes for a half-vector shuffle like
40370 pshuf[lh]w or vpermilps. */
40372 /* ??? Look for sequences of interleave that produce the desired results.
40373 The combinatorics of punpck[lh] get pretty ugly... */
40375 if (expand_vec_perm_even_odd (d))
40376 return true;
40378 /* Even longer sequences. */
40379 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40380 return true;
40382 return false;
40385 /* If a permutation only uses one operand, make it clear. Returns true
40386 if the permutation references both operands. */
40388 static bool
40389 canonicalize_perm (struct expand_vec_perm_d *d)
40391 int i, which, nelt = d->nelt;
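/* Bit 0 of WHICH is set if any element comes from the first operand,
bit 1 if any element comes from the second. */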
40393 for (i = which = 0; i < nelt; ++i)
40394 which |= (d->perm[i] < nelt ? 1 : 2);
40396 d->one_operand_p = true;
40397 switch (which)
40399 default:
40400 gcc_unreachable();
40402 case 3:
40403 if (!rtx_equal_p (d->op0, d->op1))
40405 d->one_operand_p = false;
40406 break;
40408 /* The elements of PERM do not suggest that only the first operand
40409 is used, but both operands are identical. Allow easier matching
40410 of the permutation by folding the permutation into the single
40411 input vector. */
40412 /* FALLTHRU */
40414 case 2:
40415 for (i = 0; i < nelt; ++i)
40416 d->perm[i] &= nelt - 1;
40417 d->op0 = d->op1;
40418 break;
40420 case 1:
40421 d->op1 = d->op0;
40422 break;
40425 return (which == 3);
40428 bool
40429 ix86_expand_vec_perm_const (rtx operands[4])
40431 struct expand_vec_perm_d d;
40432 unsigned char perm[MAX_VECT_LEN];
40433 int i, nelt;
40434 bool two_args;
40435 rtx sel;
40437 d.target = operands[0];
40438 d.op0 = operands[1];
40439 d.op1 = operands[2];
40440 sel = operands[3];
40442 d.vmode = GET_MODE (d.target);
40443 gcc_assert (VECTOR_MODE_P (d.vmode));
40444 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40445 d.testing_p = false;
40447 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40448 gcc_assert (XVECLEN (sel, 0) == nelt);
40449 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40451 for (i = 0; i < nelt; ++i)
40453 rtx e = XVECEXP (sel, 0, i);
40454 int ei = INTVAL (e) & (2 * nelt - 1);
40455 d.perm[i] = ei;
40456 perm[i] = ei;
40459 two_args = canonicalize_perm (&d);
40461 if (ix86_expand_vec_perm_const_1 (&d))
40462 return true;
40464 /* If the selector says both arguments are needed, but the operands are the
40465 same, the above tried to expand with one_operand_p and flattened selector.
40466 If that didn't work, retry without one_operand_p; we succeeded with that
40467 during testing. */
40468 if (two_args && d.one_operand_p)
40470 d.one_operand_p = false;
40471 memcpy (d.perm, perm, sizeof (perm));
40472 return ix86_expand_vec_perm_const_1 (&d);
40475 return false;
40478 /* Implement targetm.vectorize.vec_perm_const_ok. */
40480 static bool
40481 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40482 const unsigned char *sel)
40484 struct expand_vec_perm_d d;
40485 unsigned int i, nelt, which;
40486 bool ret;
40488 d.vmode = vmode;
40489 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40490 d.testing_p = true;
40492 /* Given sufficient ISA support we can just return true here
40493 for selected vector modes. */
40494 if (GET_MODE_SIZE (d.vmode) == 16)
40496 /* All implementable with a single vpperm insn. */
40497 if (TARGET_XOP)
40498 return true;
40499 /* All implementable with 2 pshufb + 1 ior. */
40500 if (TARGET_SSSE3)
40501 return true;
40502 /* All implementable with shufpd or unpck[lh]pd. */
40503 if (d.nelt == 2)
40504 return true;
40507 /* Extract the values from the vector CST into the permutation
40508 array in D. */
40509 memcpy (d.perm, sel, nelt);
40510 for (i = which = 0; i < nelt; ++i)
40512 unsigned char e = d.perm[i];
40513 gcc_assert (e < 2 * nelt);
40514 which |= (e < nelt ? 1 : 2);
40517 /* For all elements from second vector, fold the elements to first. */
40518 if (which == 2)
40519 for (i = 0; i < nelt; ++i)
40520 d.perm[i] -= nelt;
40522 /* Check whether the mask can be applied to the vector type. */
40523 d.one_operand_p = (which != 3);
40525 /* Implementable with shufps or pshufd. */
40526 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40527 return true;
40529 /* Otherwise we have to go through the motions and see if we can
40530 figure out how to generate the requested permutation. */
40531 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40532 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40533 if (!d.one_operand_p)
40534 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40536 start_sequence ();
40537 ret = ix86_expand_vec_perm_const_1 (&d);
40538 end_sequence ();
40540 return ret;
40543 void
40544 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40546 struct expand_vec_perm_d d;
40547 unsigned i, nelt;
40549 d.target = targ;
40550 d.op0 = op0;
40551 d.op1 = op1;
40552 d.vmode = GET_MODE (targ);
40553 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40554 d.one_operand_p = false;
40555 d.testing_p = false;
40557 for (i = 0; i < nelt; ++i)
40558 d.perm[i] = i * 2 + odd;
40560 /* We'll either be able to implement the permutation directly... */
40561 if (expand_vec_perm_1 (&d))
40562 return;
40564 /* ... or we use the special-case patterns. */
40565 expand_vec_perm_even_odd_1 (&d, odd);
40568 static void
40569 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40571 struct expand_vec_perm_d d;
40572 unsigned i, nelt, base;
40573 bool ok;
40575 d.target = targ;
40576 d.op0 = op0;
40577 d.op1 = op1;
40578 d.vmode = GET_MODE (targ);
40579 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40580 d.one_operand_p = false;
40581 d.testing_p = false;
40583 base = high_p ? nelt / 2 : 0;
40584 for (i = 0; i < nelt / 2; ++i)
40586 d.perm[i * 2] = i + base;
40587 d.perm[i * 2 + 1] = i + base + nelt;
40590 /* Note that for AVX this isn't one instruction. */
40591 ok = ix86_expand_vec_perm_const_1 (&d);
40592 gcc_assert (ok);
40596 /* Expand a vector operation CODE for a V*QImode in terms of the
40597 same operation on V*HImode. */
40599 void
40600 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40602 enum machine_mode qimode = GET_MODE (dest);
40603 enum machine_mode himode;
40604 rtx (*gen_il) (rtx, rtx, rtx);
40605 rtx (*gen_ih) (rtx, rtx, rtx);
40606 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40607 struct expand_vec_perm_d d;
40608 bool ok, full_interleave;
40609 bool uns_p = false;
40610 int i;
40612 switch (qimode)
40614 case V16QImode:
40615 himode = V8HImode;
40616 gen_il = gen_vec_interleave_lowv16qi;
40617 gen_ih = gen_vec_interleave_highv16qi;
40618 break;
40619 case V32QImode:
40620 himode = V16HImode;
40621 gen_il = gen_avx2_interleave_lowv32qi;
40622 gen_ih = gen_avx2_interleave_highv32qi;
40623 break;
40624 default:
40625 gcc_unreachable ();
40628 op2_l = op2_h = op2;
40629 switch (code)
40631 case MULT:
40632 /* Unpack data such that we've got a source byte in each low byte of
40633 each word. We don't care what goes into the high byte of each word.
40634 Rather than trying to get zero in there, it is most convenient to let
40635 it be a copy of the low byte. */
40636 op2_l = gen_reg_rtx (qimode);
40637 op2_h = gen_reg_rtx (qimode);
40638 emit_insn (gen_il (op2_l, op2, op2));
40639 emit_insn (gen_ih (op2_h, op2, op2));
40640 /* FALLTHRU */
40642 op1_l = gen_reg_rtx (qimode);
40643 op1_h = gen_reg_rtx (qimode);
40644 emit_insn (gen_il (op1_l, op1, op1));
40645 emit_insn (gen_ih (op1_h, op1, op1));
40646 full_interleave = qimode == V16QImode;
40647 break;
40649 case ASHIFT:
40650 case LSHIFTRT:
40651 uns_p = true;
40652 /* FALLTHRU */
40653 case ASHIFTRT:
40654 op1_l = gen_reg_rtx (himode);
40655 op1_h = gen_reg_rtx (himode);
40656 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40657 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40658 full_interleave = true;
40659 break;
40660 default:
40661 gcc_unreachable ();
40664 /* Perform the operation. */
40665 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40666 1, OPTAB_DIRECT);
40667 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40668 1, OPTAB_DIRECT);
40669 gcc_assert (res_l && res_h);
40671 /* Merge the data back into the right place. */
40672 d.target = dest;
40673 d.op0 = gen_lowpart (qimode, res_l);
40674 d.op1 = gen_lowpart (qimode, res_h);
40675 d.vmode = qimode;
40676 d.nelt = GET_MODE_NUNITS (qimode);
40677 d.one_operand_p = false;
40678 d.testing_p = false;
40680 if (full_interleave)
40682 /* For SSE2, we used a full interleave, so the desired
40683 results are in the even elements.
40684 for (i = 0; i < 32; ++i)
40685 d.perm[i] = i * 2;
40687 else
40689 /* For AVX, the interleave used above was not cross-lane. So the
40690 extraction is evens but with the second and third quarter swapped.
40691 Happily, that is even one insn shorter than even extraction. */
40692 for (i = 0; i < 32; ++i)
40693 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
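/* Illustration of the index arithmetic above: perm[8] becomes 32 and
   perm[16] becomes 16, so the second and third quarters of the
   even-element extraction trade places, as the comment describes.  */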
40696 ok = ix86_expand_vec_perm_const_1 (&d);
40697 gcc_assert (ok);
40699 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40700 gen_rtx_fmt_ee (code, qimode, op1, op2));
40703 void
40704 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40705 bool uns_p, bool odd_p)
40707 enum machine_mode mode = GET_MODE (op1);
40708 enum machine_mode wmode = GET_MODE (dest);
40709 rtx x;
40711 /* We only play even/odd games with vectors of SImode. */
40712 gcc_assert (mode == V4SImode || mode == V8SImode);
40714 /* If we're looking for the odd results, shift those members down to
40715 the even slots. For some cpus this is faster than a PSHUFD. */
40716 if (odd_p)
40718 if (TARGET_XOP && mode == V4SImode)
40720 x = force_reg (wmode, CONST0_RTX (wmode));
40721 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40722 return;
40725 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40726 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40727 x, NULL, 1, OPTAB_DIRECT);
40728 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40729 x, NULL, 1, OPTAB_DIRECT);
40730 op1 = gen_lowpart (mode, op1);
40731 op2 = gen_lowpart (mode, op2);
40734 if (mode == V8SImode)
40736 if (uns_p)
40737 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40738 else
40739 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40741 else if (uns_p)
40742 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40743 else if (TARGET_SSE4_1)
40744 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40745 else
40747 rtx s1, s2, t0, t1, t2;
40749 /* The easiest way to implement this without PMULDQ is to go through
40750 the motions as if we are performing a full 64-bit multiply, except
40751 that we need to do less shuffling of the elements. */
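/* Sketch of the arithmetic: with UA, UB the unsigned views of signed
   lanes A and B, A = UA - 2^32*[A<0] and B = UB - 2^32*[B<0], so
   modulo 2^64, A*B == UA*UB - 2^32*(UA*[B<0] + UB*[A<0]).  The
   all-ones masks S1, S2 built below are -[A<0] and -[B<0] as 32-bit
   values, so their unsigned even products, summed and shifted left by
   32, subtract exactly that correction term mod 2^64.  */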
40753 /* Compute the sign-extension, aka highparts, of the two operands. */
40754 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40755 op1, pc_rtx, pc_rtx);
40756 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40757 op2, pc_rtx, pc_rtx);
40759 /* Multiply LO(A) * HI(B), and vice-versa. */
40760 t1 = gen_reg_rtx (wmode);
40761 t2 = gen_reg_rtx (wmode);
40762 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40763 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40765 /* Multiply LO(A) * LO(B). */
40766 t0 = gen_reg_rtx (wmode);
40767 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40769 /* Combine and shift the highparts into place. */
40770 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40771 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40772 1, OPTAB_DIRECT);
40774 /* Combine high and low parts. */
40775 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40776 return;
40778 emit_insn (x);
40781 void
40782 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40783 bool uns_p, bool high_p)
40785 enum machine_mode wmode = GET_MODE (dest);
40786 enum machine_mode mode = GET_MODE (op1);
40787 rtx t1, t2, t3, t4, mask;
40789 switch (mode)
40791 case V4SImode:
40792 t1 = gen_reg_rtx (mode);
40793 t2 = gen_reg_rtx (mode);
40794 if (TARGET_XOP && !uns_p)
40796 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40797 shuffle the elements once so that all elements are in the right
40798 place for immediate use: { A C B D }. */
40799 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40800 const1_rtx, GEN_INT (3)));
40801 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40802 const1_rtx, GEN_INT (3)));
40804 else
40806 /* Put the elements into place for the multiply. */
40807 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40808 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40809 high_p = false;
40811 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40812 break;
40814 case V8SImode:
40815 /* Shuffle the elements between the lanes. After this we
40816 have { A B E F | C D G H } for each operand. */
40817 t1 = gen_reg_rtx (V4DImode);
40818 t2 = gen_reg_rtx (V4DImode);
40819 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40820 const0_rtx, const2_rtx,
40821 const1_rtx, GEN_INT (3)));
40822 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40823 const0_rtx, const2_rtx,
40824 const1_rtx, GEN_INT (3)));
40826 /* Shuffle the elements within the lanes. After this we
40827 have { A A B B | C C D D } or { E E F F | G G H H }. */
40828 t3 = gen_reg_rtx (V8SImode);
40829 t4 = gen_reg_rtx (V8SImode);
40830 mask = GEN_INT (high_p
40831 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40832 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
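/* Concretely, the immediate is 0x50 (elements { 0, 0, 1, 1 } per lane)
   for the low half and 0xFA ({ 2, 2, 3, 3 }) for the high half.  */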
40833 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40834 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40836 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40837 break;
40839 case V8HImode:
40840 case V16HImode:
40841 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40842 uns_p, OPTAB_DIRECT);
40843 t2 = expand_binop (mode,
40844 uns_p ? umul_highpart_optab : smul_highpart_optab,
40845 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40846 gcc_assert (t1 && t2);
40848 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40849 break;
40851 case V16QImode:
40852 case V32QImode:
40853 t1 = gen_reg_rtx (wmode);
40854 t2 = gen_reg_rtx (wmode);
40855 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40856 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40858 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40859 break;
40861 default:
40862 gcc_unreachable ();
40866 void
40867 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40869 rtx res_1, res_2;
40871 res_1 = gen_reg_rtx (V4SImode);
40872 res_2 = gen_reg_rtx (V4SImode);
40873 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40874 op1, op2, true, false);
40875 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40876 op1, op2, true, true);
40878 /* Move the results in element 2 down to element 1; we don't care
40879 what goes in elements 2 and 3. Then we can merge the parts
40880 back together with an interleave.
40882 Note that two other sequences were tried:
40883 (1) Use interleaves at the start instead of psrldq, which allows
40884 us to use a single shufps to merge things back at the end.
40885 (2) Use shufps here to combine the two vectors, then pshufd to
40886 put the elements in the correct order.
40887 In both cases the cost of the reformatting stall was too high
40888 and the overall sequence slower. */
40890 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40891 const0_rtx, const0_rtx));
40892 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40893 const0_rtx, const0_rtx));
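/* After the two shuffles, viewed as V4SImode, the first two elements
   of RES_1 are the low halves of products 0 and 2 and those of RES_2
   are the low halves of products 1 and 3, so the interleave below
   yields { p0, p1, p2, p3 }.  */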
40894 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40896 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
40899 void
40900 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40902 enum machine_mode mode = GET_MODE (op0);
40903 rtx t1, t2, t3, t4, t5, t6;
40905 if (TARGET_XOP && mode == V2DImode)
40907 /* op1: A,B,C,D, op2: E,F,G,H */
40908 op1 = gen_lowpart (V4SImode, op1);
40909 op2 = gen_lowpart (V4SImode, op2);
40911 t1 = gen_reg_rtx (V4SImode);
40912 t2 = gen_reg_rtx (V4SImode);
40913 t3 = gen_reg_rtx (V2DImode);
40914 t4 = gen_reg_rtx (V2DImode);
40916 /* t1: B,A,D,C */
40917 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40918 GEN_INT (1),
40919 GEN_INT (0),
40920 GEN_INT (3),
40921 GEN_INT (2)));
40923 /* t2: (B*E),(A*F),(D*G),(C*H) */
40924 emit_insn (gen_mulv4si3 (t2, t1, op2));
40926 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40927 emit_insn (gen_xop_phadddq (t3, t2));
40929 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40930 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40932 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40933 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40935 else
40937 enum machine_mode nmode;
40938 rtx (*umul) (rtx, rtx, rtx);
40940 if (mode == V2DImode)
40942 umul = gen_vec_widen_umult_even_v4si;
40943 nmode = V4SImode;
40945 else if (mode == V4DImode)
40947 umul = gen_vec_widen_umult_even_v8si;
40948 nmode = V8SImode;
40950 else
40951 gcc_unreachable ();
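/* The decomposition used below: writing each 64-bit lane of OP1 as
   AL + 2^32*AH and of OP2 as BL + 2^32*BH, the product modulo 2^64 is
   AL*BL + 2^32*(AH*BL + AL*BH); the AH*BH term is shifted out
   entirely and can be ignored.  */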
40954 /* Multiply low parts. */
40955 t1 = gen_reg_rtx (mode);
40956 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40958 /* Shift input vectors right 32 bits so we can multiply high parts. */
40959 t6 = GEN_INT (32);
40960 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40961 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40963 /* Multiply high parts by low parts. */
40964 t4 = gen_reg_rtx (mode);
40965 t5 = gen_reg_rtx (mode);
40966 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40967 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40969 /* Combine and shift the highparts back. */
40970 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40971 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40973 /* Combine high and low parts. */
40974 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
40977 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40978 gen_rtx_MULT (mode, op1, op2));
40981 /* Expand an insert into a vector register through pinsr insn.
40982 Return true if successful. */
40984 bool
40985 ix86_expand_pinsr (rtx *operands)
40987 rtx dst = operands[0];
40988 rtx src = operands[3];
40990 unsigned int size = INTVAL (operands[1]);
40991 unsigned int pos = INTVAL (operands[2]);
40993 if (GET_CODE (dst) == SUBREG)
40995 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
40996 dst = SUBREG_REG (dst);
40999 if (GET_CODE (src) == SUBREG)
41000 src = SUBREG_REG (src);
41002 switch (GET_MODE (dst))
41004 case V16QImode:
41005 case V8HImode:
41006 case V4SImode:
41007 case V2DImode:
41009 enum machine_mode srcmode, dstmode;
41010 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41012 srcmode = mode_for_size (size, MODE_INT, 0);
41014 switch (srcmode)
41016 case QImode:
41017 if (!TARGET_SSE4_1)
41018 return false;
41019 dstmode = V16QImode;
41020 pinsr = gen_sse4_1_pinsrb;
41021 break;
41023 case HImode:
41024 if (!TARGET_SSE2)
41025 return false;
41026 dstmode = V8HImode;
41027 pinsr = gen_sse2_pinsrw;
41028 break;
41030 case SImode:
41031 if (!TARGET_SSE4_1)
41032 return false;
41033 dstmode = V4SImode;
41034 pinsr = gen_sse4_1_pinsrd;
41035 break;
41037 case DImode:
41038 gcc_assert (TARGET_64BIT);
41039 if (!TARGET_SSE4_1)
41040 return false;
41041 dstmode = V2DImode;
41042 pinsr = gen_sse4_1_pinsrq;
41043 break;
41045 default:
41046 return false;
41049 dst = gen_lowpart (dstmode, dst);
41050 src = gen_lowpart (srcmode, src);
41052 pos /= size;
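/* For instance, inserting an HImode value at bit 32 of a V8HImode
   destination gives pos = 32 / 16 = 2, so the selector operand below
   becomes (1 << 2), i.e. word element 2.  */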
41054 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41055 return true;
41058 default:
41059 return false;
41063 /* This function returns the calling-ABI-specific va_list type node.
41064 It returns the FNDECL-specific va_list type. */
41066 static tree
41067 ix86_fn_abi_va_list (tree fndecl)
41069 if (!TARGET_64BIT)
41070 return va_list_type_node;
41071 gcc_assert (fndecl != NULL_TREE);
41073 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41074 return ms_va_list_type_node;
41075 else
41076 return sysv_va_list_type_node;
41079 /* Returns the canonical va_list type specified by TYPE. If there
41080 is no valid TYPE provided, it returns NULL_TREE. */
41082 static tree
41083 ix86_canonical_va_list_type (tree type)
41085 tree wtype, htype;
41087 /* Resolve references and pointers to va_list type. */
41088 if (TREE_CODE (type) == MEM_REF)
41089 type = TREE_TYPE (type);
41090 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41091 type = TREE_TYPE (type);
41092 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41093 type = TREE_TYPE (type);
41095 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41097 wtype = va_list_type_node;
41098 gcc_assert (wtype != NULL_TREE);
41099 htype = type;
41100 if (TREE_CODE (wtype) == ARRAY_TYPE)
41102 /* If va_list is an array type, the argument may have decayed
41103 to a pointer type, e.g. by being passed to another function.
41104 In that case, unwrap both types so that we can compare the
41105 underlying records. */
41106 if (TREE_CODE (htype) == ARRAY_TYPE
41107 || POINTER_TYPE_P (htype))
41109 wtype = TREE_TYPE (wtype);
41110 htype = TREE_TYPE (htype);
41113 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41114 return va_list_type_node;
41115 wtype = sysv_va_list_type_node;
41116 gcc_assert (wtype != NULL_TREE);
41117 htype = type;
41118 if (TREE_CODE (wtype) == ARRAY_TYPE)
41120 /* If va_list is an array type, the argument may have decayed
41121 to a pointer type, e.g. by being passed to another function.
41122 In that case, unwrap both types so that we can compare the
41123 underlying records. */
41124 if (TREE_CODE (htype) == ARRAY_TYPE
41125 || POINTER_TYPE_P (htype))
41127 wtype = TREE_TYPE (wtype);
41128 htype = TREE_TYPE (htype);
41131 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41132 return sysv_va_list_type_node;
41133 wtype = ms_va_list_type_node;
41134 gcc_assert (wtype != NULL_TREE);
41135 htype = type;
41136 if (TREE_CODE (wtype) == ARRAY_TYPE)
41138 /* If va_list is an array type, the argument may have decayed
41139 to a pointer type, e.g. by being passed to another function.
41140 In that case, unwrap both types so that we can compare the
41141 underlying records. */
41142 if (TREE_CODE (htype) == ARRAY_TYPE
41143 || POINTER_TYPE_P (htype))
41145 wtype = TREE_TYPE (wtype);
41146 htype = TREE_TYPE (htype);
41149 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41150 return ms_va_list_type_node;
41151 return NULL_TREE;
41153 return std_canonical_va_list_type (type);
41156 /* Iterate through the target-specific builtin types for va_list.
41157 IDX denotes the iterator, *PTREE is set to the result type of
41158 the va_list builtin, and *PNAME to its internal type.
41159 Returns zero if there is no element for this index, otherwise
41160 IDX should be increased upon the next call.
41161 Note, do not iterate a base builtin's name like __builtin_va_list.
41162 Used from c_common_nodes_and_builtins. */
41164 static int
41165 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41167 if (TARGET_64BIT)
41169 switch (idx)
41171 default:
41172 break;
41174 case 0:
41175 *ptree = ms_va_list_type_node;
41176 *pname = "__builtin_ms_va_list";
41177 return 1;
41179 case 1:
41180 *ptree = sysv_va_list_type_node;
41181 *pname = "__builtin_sysv_va_list";
41182 return 1;
41186 return 0;
41189 #undef TARGET_SCHED_DISPATCH
41190 #define TARGET_SCHED_DISPATCH has_dispatch
41191 #undef TARGET_SCHED_DISPATCH_DO
41192 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41193 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41194 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41195 #undef TARGET_SCHED_REORDER
41196 #define TARGET_SCHED_REORDER ix86_sched_reorder
41197 #undef TARGET_SCHED_ADJUST_PRIORITY
41198 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41199 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41200 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41202 /* The size of the dispatch window is the total number of bytes of
41203 object code allowed in a window. */
41204 #define DISPATCH_WINDOW_SIZE 16
41206 /* Number of dispatch windows considered for scheduling. */
41207 #define MAX_DISPATCH_WINDOWS 3
41209 /* Maximum number of instructions in a window. */
41210 #define MAX_INSN 4
41212 /* Maximum number of immediate operands in a window. */
41213 #define MAX_IMM 4
41215 /* Maximum number of immediate bits allowed in a window. */
41216 #define MAX_IMM_SIZE 128
41218 /* Maximum number of 32 bit immediates allowed in a window. */
41219 #define MAX_IMM_32 4
41221 /* Maximum number of 64 bit immediates allowed in a window. */
41222 #define MAX_IMM_64 2
41224 /* Maximum total of loads or prefetches allowed in a window. */
41225 #define MAX_LOAD 2
41227 /* Maximum total of stores allowed in a window. */
41228 #define MAX_STORE 1
41230 #undef BIG
41231 #define BIG 100
41234 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41235 enum dispatch_group {
41236 disp_no_group = 0,
41237 disp_load,
41238 disp_store,
41239 disp_load_store,
41240 disp_prefetch,
41241 disp_imm,
41242 disp_imm_32,
41243 disp_imm_64,
41244 disp_branch,
41245 disp_cmp,
41246 disp_jcc,
41247 disp_last
41250 /* Number of allowable groups in a dispatch window. It is an array
41251 indexed by dispatch_group enum. 100 is used as a big number,
41252 because the number of these kind of operations does not have any
41253 effect in dispatch window, but we need them for other reasons in
41254 the table. */
41255 static unsigned int num_allowable_groups[disp_last] = {
41256 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
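/* Read by group, the values above allow 2 loads, 1 store, 1 load-store,
   2 prefetches, 4 immediates, 4 32-bit immediates, 2 64-bit immediates
   and 1 branch per window, with compares and jcc effectively unlimited
   (BIG); these line up with the MAX_* constants defined earlier.  */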
41259 char group_name[disp_last + 1][16] = {
41260 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41261 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41262 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41265 /* Instruction path. */
41266 enum insn_path {
41267 no_path = 0,
41268 path_single, /* Single micro op. */
41269 path_double, /* Double micro op. */
41270 path_multi, /* Instructions with more than 2 micro ops. */
41271 last_path
41274 /* sched_insn_info defines a window to the instructions scheduled in
41275 the basic block. It contains a pointer to the insn_info table and
41276 the instruction scheduled.
41278 Windows are allocated for each basic block and are linked
41279 together. */
41280 typedef struct sched_insn_info_s {
41281 rtx insn;
41282 enum dispatch_group group;
41283 enum insn_path path;
41284 int byte_len;
41285 int imm_bytes;
41286 } sched_insn_info;
41288 /* Linked list of dispatch windows. This is a two-way list of
41289 dispatch windows of a basic block. It contains information about
41290 the number of uops in the window and the total number of
41291 instructions and of bytes in the object code for this dispatch
41292 window. */
41293 typedef struct dispatch_windows_s {
41294 int num_insn; /* Number of insn in the window. */
41295 int num_uops; /* Number of uops in the window. */
41296 int window_size; /* Number of bytes in the window. */
41297 int window_num; /* Window number, either 0 or 1. */
41298 int num_imm; /* Number of immediates in an insn. */
41299 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41300 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41301 int imm_size; /* Total immediates in the window. */
41302 int num_loads; /* Total memory loads in the window. */
41303 int num_stores; /* Total memory stores in the window. */
41304 int violation; /* Violation exists in window. */
41305 sched_insn_info *window; /* Pointer to the window. */
41306 struct dispatch_windows_s *next;
41307 struct dispatch_windows_s *prev;
41308 } dispatch_windows;
41310 /* Immediate values used in an insn. */
41311 typedef struct imm_info_s
41313 int imm;
41314 int imm32;
41315 int imm64;
41316 } imm_info;
41318 static dispatch_windows *dispatch_window_list;
41319 static dispatch_windows *dispatch_window_list1;
41321 /* Get dispatch group of insn. */
41323 static enum dispatch_group
41324 get_mem_group (rtx insn)
41326 enum attr_memory memory;
41328 if (INSN_CODE (insn) < 0)
41329 return disp_no_group;
41330 memory = get_attr_memory (insn);
41331 if (memory == MEMORY_STORE)
41332 return disp_store;
41334 if (memory == MEMORY_LOAD)
41335 return disp_load;
41337 if (memory == MEMORY_BOTH)
41338 return disp_load_store;
41340 return disp_no_group;
41343 /* Return true if insn is a compare instruction. */
41345 static bool
41346 is_cmp (rtx insn)
41348 enum attr_type type;
41350 type = get_attr_type (insn);
41351 return (type == TYPE_TEST
41352 || type == TYPE_ICMP
41353 || type == TYPE_FCMP
41354 || GET_CODE (PATTERN (insn)) == COMPARE);
41357 /* Return true if a dispatch violation was encountered. */
41359 static bool
41360 dispatch_violation (void)
41362 if (dispatch_window_list->next)
41363 return dispatch_window_list->next->violation;
41364 return dispatch_window_list->violation;
41367 /* Return true if insn is a branch instruction. */
41369 static bool
41370 is_branch (rtx insn)
41372 return (CALL_P (insn) || JUMP_P (insn));
41375 /* Return true if insn is a prefetch instruction. */
41377 static bool
41378 is_prefetch (rtx insn)
41380 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41383 /* This function initializes a dispatch window and the list container holding a
41384 pointer to the window. */
41386 static void
41387 init_window (int window_num)
41389 int i;
41390 dispatch_windows *new_list;
41392 if (window_num == 0)
41393 new_list = dispatch_window_list;
41394 else
41395 new_list = dispatch_window_list1;
41397 new_list->num_insn = 0;
41398 new_list->num_uops = 0;
41399 new_list->window_size = 0;
41400 new_list->next = NULL;
41401 new_list->prev = NULL;
41402 new_list->window_num = window_num;
41403 new_list->num_imm = 0;
41404 new_list->num_imm_32 = 0;
41405 new_list->num_imm_64 = 0;
41406 new_list->imm_size = 0;
41407 new_list->num_loads = 0;
41408 new_list->num_stores = 0;
41409 new_list->violation = false;
41411 for (i = 0; i < MAX_INSN; i++)
41413 new_list->window[i].insn = NULL;
41414 new_list->window[i].group = disp_no_group;
41415 new_list->window[i].path = no_path;
41416 new_list->window[i].byte_len = 0;
41417 new_list->window[i].imm_bytes = 0;
41419 return;
41422 /* This function allocates and initializes a dispatch window and the
41423 list container holding a pointer to the window. */
41425 static dispatch_windows *
41426 allocate_window (void)
41428 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41429 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41431 return new_list;
41434 /* This routine initializes the dispatch scheduling information. It
41435 initiates building dispatch scheduler tables and constructs the
41436 first dispatch window. */
41438 static void
41439 init_dispatch_sched (void)
41441 /* Allocate a dispatch list and a window. */
41442 dispatch_window_list = allocate_window ();
41443 dispatch_window_list1 = allocate_window ();
41444 init_window (0);
41445 init_window (1);
41448 /* This function returns true if a branch is detected. End of a basic block
41449 does not have to be a branch, but here we assume only branches end a
41450 window. */
41452 static bool
41453 is_end_basic_block (enum dispatch_group group)
41455 return group == disp_branch;
41458 /* This function is called when the end of a window processing is reached. */
41460 static void
41461 process_end_window (void)
41463 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41464 if (dispatch_window_list->next)
41466 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41467 gcc_assert (dispatch_window_list->window_size
41468 + dispatch_window_list1->window_size <= 48);
41469 init_window (1);
41471 init_window (0);
41474 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41475 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41476 for 48 bytes of instructions. Note that these windows are not dispatch
41477 windows whose size is DISPATCH_WINDOW_SIZE. */
41479 static dispatch_windows *
41480 allocate_next_window (int window_num)
41482 if (window_num == 0)
41484 if (dispatch_window_list->next)
41485 init_window (1);
41486 init_window (0);
41487 return dispatch_window_list;
41490 dispatch_window_list->next = dispatch_window_list1;
41491 dispatch_window_list1->prev = dispatch_window_list;
41493 return dispatch_window_list1;
41496 /* Increment the number of immediate operands of an instruction. */
41498 static int
41499 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41501 if (*in_rtx == 0)
41502 return 0;
41504 switch ( GET_CODE (*in_rtx))
41506 case CONST:
41507 case SYMBOL_REF:
41508 case CONST_INT:
41509 (imm_values->imm)++;
41510 if (x86_64_immediate_operand (*in_rtx, SImode))
41511 (imm_values->imm32)++;
41512 else
41513 (imm_values->imm64)++;
41514 break;
41516 case CONST_DOUBLE:
41517 (imm_values->imm)++;
41518 (imm_values->imm64)++;
41519 break;
41521 case CODE_LABEL:
41522 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41524 (imm_values->imm)++;
41525 (imm_values->imm32)++;
41527 break;
41529 default:
41530 break;
41533 return 0;
41536 /* Compute number of immediate operands of an instruction. */
41538 static void
41539 find_constant (rtx in_rtx, imm_info *imm_values)
41541 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41542 (rtx_function) find_constant_1, (void *) imm_values);
41545 /* Return total size of immediate operands of an instruction along with number
41546 of corresponding immediate-operands. It initializes its parameters to zero
41547 before calling FIND_CONSTANT.
41548 INSN is the input instruction. IMM is the total of immediates.
41549 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41550 bit immediates. */
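/* Example: an insn carrying one 32-bit and one 64-bit immediate sets
   *IMM = 2, *IMM32 = 1, *IMM64 = 1 and returns 4 + 8 = 12 bytes.  */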
41552 static int
41553 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41555 imm_info imm_values = {0, 0, 0};
41557 find_constant (insn, &imm_values);
41558 *imm = imm_values.imm;
41559 *imm32 = imm_values.imm32;
41560 *imm64 = imm_values.imm64;
41561 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
41564 /* This function indicates if an operand of an instruction is an
41565 immediate. */
41567 static bool
41568 has_immediate (rtx insn)
41570 int num_imm_operand;
41571 int num_imm32_operand;
41572 int num_imm64_operand;
41574 if (insn)
41575 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41576 &num_imm64_operand);
41577 return false;
41580 /* Return single or double path for instructions. */
41582 static enum insn_path
41583 get_insn_path (rtx insn)
41585 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41587 if ((int)path == 0)
41588 return path_single;
41590 if ((int)path == 1)
41591 return path_double;
41593 return path_multi;
41596 /* Return insn dispatch group. */
41598 static enum dispatch_group
41599 get_insn_group (rtx insn)
41601 enum dispatch_group group = get_mem_group (insn);
41602 if (group)
41603 return group;
41605 if (is_branch (insn))
41606 return disp_branch;
41608 if (is_cmp (insn))
41609 return disp_cmp;
41611 if (has_immediate (insn))
41612 return disp_imm;
41614 if (is_prefetch (insn))
41615 return disp_prefetch;
41617 return disp_no_group;
41620 /* Count number of GROUP restricted instructions in a dispatch
41621 window WINDOW_LIST. */
41623 static int
41624 count_num_restricted (rtx insn, dispatch_windows *window_list)
41626 enum dispatch_group group = get_insn_group (insn);
41627 int imm_size;
41628 int num_imm_operand;
41629 int num_imm32_operand;
41630 int num_imm64_operand;
41632 if (group == disp_no_group)
41633 return 0;
41635 if (group == disp_imm)
41637 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41638 &num_imm64_operand);
41639 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41640 || num_imm_operand + window_list->num_imm > MAX_IMM
41641 || (num_imm32_operand > 0
41642 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41643 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41644 || (num_imm64_operand > 0
41645 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41646 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41647 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41648 && num_imm64_operand > 0
41649 && ((window_list->num_imm_64 > 0
41650 && window_list->num_insn >= 2)
41651 || window_list->num_insn >= 3)))
41652 return BIG;
41654 return 1;
41657 if ((group == disp_load_store
41658 && (window_list->num_loads >= MAX_LOAD
41659 || window_list->num_stores >= MAX_STORE))
41660 || ((group == disp_load
41661 || group == disp_prefetch)
41662 && window_list->num_loads >= MAX_LOAD)
41663 || (group == disp_store
41664 && window_list->num_stores >= MAX_STORE))
41665 return BIG;
41667 return 1;
41670 /* This function returns true if insn satisfies dispatch rules on the
41671 last window scheduled. */
41673 static bool
41674 fits_dispatch_window (rtx insn)
41676 dispatch_windows *window_list = dispatch_window_list;
41677 dispatch_windows *window_list_next = dispatch_window_list->next;
41678 unsigned int num_restrict;
41679 enum dispatch_group group = get_insn_group (insn);
41680 enum insn_path path = get_insn_path (insn);
41681 int sum;
41683 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41684 instructions should be given the lowest priority in the
41685 scheduling process in the Haifa scheduler to make sure they will be
41686 scheduled in the same dispatch window as the reference to them. */
41687 if (group == disp_jcc || group == disp_cmp)
41688 return false;
41690 /* Check nonrestricted. */
41691 if (group == disp_no_group || group == disp_branch)
41692 return true;
41694 /* Get last dispatch window. */
41695 if (window_list_next)
41696 window_list = window_list_next;
41698 if (window_list->window_num == 1)
41700 sum = window_list->prev->window_size + window_list->window_size;
41702 if (sum == 32
41703 || (min_insn_size (insn) + sum) >= 48)
41704 /* Window 1 is full. Go for next window. */
41705 return true;
41708 num_restrict = count_num_restricted (insn, window_list);
41710 if (num_restrict > num_allowable_groups[group])
41711 return false;
41713 /* See if it fits in the first window. */
41714 if (window_list->window_num == 0)
41716 /* The first window should have only single and double path
41717 uops. */
41718 if (path == path_double
41719 && (window_list->num_uops + 2) > MAX_INSN)
41720 return false;
41721 else if (path != path_single)
41722 return false;
41724 return true;
41727 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41728 dispatch window WINDOW_LIST. */
41730 static void
41731 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41733 int byte_len = min_insn_size (insn);
41734 int num_insn = window_list->num_insn;
41735 int imm_size;
41736 sched_insn_info *window = window_list->window;
41737 enum dispatch_group group = get_insn_group (insn);
41738 enum insn_path path = get_insn_path (insn);
41739 int num_imm_operand;
41740 int num_imm32_operand;
41741 int num_imm64_operand;
41743 if (!window_list->violation && group != disp_cmp
41744 && !fits_dispatch_window (insn))
41745 window_list->violation = true;
41747 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41748 &num_imm64_operand);
41750 /* Initialize window with new instruction. */
41751 window[num_insn].insn = insn;
41752 window[num_insn].byte_len = byte_len;
41753 window[num_insn].group = group;
41754 window[num_insn].path = path;
41755 window[num_insn].imm_bytes = imm_size;
41757 window_list->window_size += byte_len;
41758 window_list->num_insn = num_insn + 1;
41759 window_list->num_uops = window_list->num_uops + num_uops;
41760 window_list->imm_size += imm_size;
41761 window_list->num_imm += num_imm_operand;
41762 window_list->num_imm_32 += num_imm32_operand;
41763 window_list->num_imm_64 += num_imm64_operand;
41765 if (group == disp_store)
41766 window_list->num_stores += 1;
41767 else if (group == disp_load
41768 || group == disp_prefetch)
41769 window_list->num_loads += 1;
41770 else if (group == disp_load_store)
41772 window_list->num_stores += 1;
41773 window_list->num_loads += 1;
41777 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41778 If the total bytes of instructions or the number of instructions in
41779 the window exceed the allowable limits, a new window is allocated. */
41781 static void
41782 add_to_dispatch_window (rtx insn)
41784 int byte_len;
41785 dispatch_windows *window_list;
41786 dispatch_windows *next_list;
41787 dispatch_windows *window0_list;
41788 enum insn_path path;
41789 enum dispatch_group insn_group;
41790 bool insn_fits;
41791 int num_insn;
41792 int num_uops;
41793 int window_num;
41794 int insn_num_uops;
41795 int sum;
41797 if (INSN_CODE (insn) < 0)
41798 return;
41800 byte_len = min_insn_size (insn);
41801 window_list = dispatch_window_list;
41802 next_list = window_list->next;
41803 path = get_insn_path (insn);
41804 insn_group = get_insn_group (insn);
41806 /* Get the last dispatch window. */
41807 if (next_list)
41808 window_list = dispatch_window_list->next;
41810 if (path == path_single)
41811 insn_num_uops = 1;
41812 else if (path == path_double)
41813 insn_num_uops = 2;
41814 else
41815 insn_num_uops = (int) path;
41817 /* If the current window is full, get a new window.
41818 Window number zero is full if MAX_INSN uops are scheduled in it.
41819 Window number one is full if window zero's bytes plus window
41820 one's bytes equal 32, if adding the bytes of the new instruction
41821 brings the total to 48 or more, or if it already has MAX_INSN
41822 instructions in it. */
41823 num_insn = window_list->num_insn;
41824 num_uops = window_list->num_uops;
41825 window_num = window_list->window_num;
41826 insn_fits = fits_dispatch_window (insn);
41828 if (num_insn >= MAX_INSN
41829 || num_uops + insn_num_uops > MAX_INSN
41830 || !(insn_fits))
41832 window_num = ~window_num & 1;
41833 window_list = allocate_next_window (window_num);
41836 if (window_num == 0)
41838 add_insn_window (insn, window_list, insn_num_uops);
41839 if (window_list->num_insn >= MAX_INSN
41840 && insn_group == disp_branch)
41842 process_end_window ();
41843 return;
41846 else if (window_num == 1)
41848 window0_list = window_list->prev;
41849 sum = window0_list->window_size + window_list->window_size;
41850 if (sum == 32
41851 || (byte_len + sum) >= 48)
41853 process_end_window ();
41854 window_list = dispatch_window_list;
41857 add_insn_window (insn, window_list, insn_num_uops);
41859 else
41860 gcc_unreachable ();
41862 if (is_end_basic_block (insn_group))
41864 /* End of basic block is reached; do end-basic-block processing. */
41865 process_end_window ();
41866 return;
41870 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41872 DEBUG_FUNCTION static void
41873 debug_dispatch_window_file (FILE *file, int window_num)
41875 dispatch_windows *list;
41876 int i;
41878 if (window_num == 0)
41879 list = dispatch_window_list;
41880 else
41881 list = dispatch_window_list1;
41883 fprintf (file, "Window #%d:\n", list->window_num);
41884 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41885 list->num_insn, list->num_uops, list->window_size);
41886 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41887 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41889 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41890 list->num_stores);
41891 fprintf (file, " insn info:\n");
41893 for (i = 0; i < MAX_INSN; i++)
41895 if (!list->window[i].insn)
41896 break;
41897 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41898 i, group_name[list->window[i].group],
41899 i, (void *)list->window[i].insn,
41900 i, list->window[i].path,
41901 i, list->window[i].byte_len,
41902 i, list->window[i].imm_bytes);
41906 /* Print to stdout a dispatch window. */
41908 DEBUG_FUNCTION void
41909 debug_dispatch_window (int window_num)
41911 debug_dispatch_window_file (stdout, window_num);
41914 /* Print INSN dispatch information to FILE. */
41916 DEBUG_FUNCTION static void
41917 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41919 int byte_len;
41920 enum insn_path path;
41921 enum dispatch_group group;
41922 int imm_size;
41923 int num_imm_operand;
41924 int num_imm32_operand;
41925 int num_imm64_operand;
41927 if (INSN_CODE (insn) < 0)
41928 return;
41930 byte_len = min_insn_size (insn);
41931 path = get_insn_path (insn);
41932 group = get_insn_group (insn);
41933 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41934 &num_imm64_operand);
41936 fprintf (file, " insn info:\n");
41937 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41938 group_name[group], path, byte_len);
41939 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41940 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41943 /* Print to STDOUT the status of the ready list with respect to
41944 dispatch windows. */
41946 DEBUG_FUNCTION void
41947 debug_ready_dispatch (void)
41949 int i;
41950 int no_ready = number_in_ready ();
41952 fprintf (stdout, "Number of ready: %d\n", no_ready);
41954 for (i = 0; i < no_ready; i++)
41955 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41958 /* This routine is the driver of the dispatch scheduler. */
41960 static void
41961 do_dispatch (rtx insn, int mode)
41963 if (mode == DISPATCH_INIT)
41964 init_dispatch_sched ();
41965 else if (mode == ADD_TO_DISPATCH_WINDOW)
41966 add_to_dispatch_window (insn);
41969 /* Return TRUE if Dispatch Scheduling is supported. */
41971 static bool
41972 has_dispatch (rtx insn, int action)
41974 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
41975 && flag_dispatch_scheduler)
41976 switch (action)
41978 default:
41979 return false;
41981 case IS_DISPATCH_ON:
41982 return true;
41983 break;
41985 case IS_CMP:
41986 return is_cmp (insn);
41988 case DISPATCH_VIOLATION:
41989 return dispatch_violation ();
41991 case FITS_DISPATCH_WINDOW:
41992 return fits_dispatch_window (insn);
41995 return false;
41998 /* Implementation of reassociation_width target hook used by
41999 reassoc phase to identify parallelism level in reassociated
42000 tree. The statement's tree_code is passed in OPC. The argument's
42001 type is passed in MODE.
42003 Currently parallel reassociation is enabled for Atom
42004 processors only and we set reassociation width to be 2
42005 because Atom may issue up to 2 instructions per cycle.
42007 Return value should be fixed if parallel reassociation is
42008 enabled for other processors. */
42010 static int
42011 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42012 enum machine_mode mode)
42014 int res = 1;
42016 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42017 res = 2;
42018 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42019 res = 2;
42021 return res;
42024 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42025 place emms and femms instructions. */
42027 static enum machine_mode
42028 ix86_preferred_simd_mode (enum machine_mode mode)
42030 if (!TARGET_SSE)
42031 return word_mode;
42033 switch (mode)
42035 case QImode:
42036 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42037 case HImode:
42038 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42039 case SImode:
42040 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42041 case DImode:
42042 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42044 case SFmode:
42045 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42046 return V8SFmode;
42047 else
42048 return V4SFmode;
42050 case DFmode:
42051 if (!TARGET_VECTORIZE_DOUBLE)
42052 return word_mode;
42053 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42054 return V4DFmode;
42055 else if (TARGET_SSE2)
42056 return V2DFmode;
42057 /* FALLTHRU */
42059 default:
42060 return word_mode;
42064 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
42065 vectors. */
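/* The return value is a bitmask of vector sizes in bytes for the
   vectorizer to try; 32 | 16 lets it fall back to 128-bit vectors when
   256-bit vectorization does not succeed, while 0 means only the
   preferred SIMD mode is considered.  */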
42067 static unsigned int
42068 ix86_autovectorize_vector_sizes (void)
42070 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42075 /* Return class of registers which could be used for pseudo of MODE
42076 and of class RCLASS for spilling instead of memory. Return NO_REGS
42077 if it is not possible or not profitable. */
42078 static reg_class_t
42079 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42081 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42082 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42083 && INTEGER_CLASS_P (rclass))
42084 return SSE_REGS;
42085 return NO_REGS;
42088 /* Implement targetm.vectorize.init_cost. */
42090 static void *
42091 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42093 unsigned *cost = XNEWVEC (unsigned, 3);
42094 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42095 return cost;
42098 /* Implement targetm.vectorize.add_stmt_cost. */
42100 static unsigned
42101 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42102 struct _stmt_vec_info *stmt_info, int misalign,
42103 enum vect_cost_model_location where)
42105 unsigned *cost = (unsigned *) data;
42106 unsigned retval = 0;
42108 if (flag_vect_cost_model)
42110 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42111 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42113 /* Statements in an inner loop relative to the loop being
42114 vectorized are weighted more heavily. The value here is
42115 arbitrary and could potentially be improved with analysis. */
42116 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42117 count *= 50; /* FIXME. */
42119 retval = (unsigned) (count * stmt_cost);
42120 cost[where] += retval;
42123 return retval;
42126 /* Implement targetm.vectorize.finish_cost. */
42128 static void
42129 ix86_finish_cost (void *data, unsigned *prologue_cost,
42130 unsigned *body_cost, unsigned *epilogue_cost)
42132 unsigned *cost = (unsigned *) data;
42133 *prologue_cost = cost[vect_prologue];
42134 *body_cost = cost[vect_body];
42135 *epilogue_cost = cost[vect_epilogue];
42138 /* Implement targetm.vectorize.destroy_cost_data. */
42140 static void
42141 ix86_destroy_cost_data (void *data)
42143 free (data);
42146 /* Validate target specific memory model bits in VAL. */
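/* A minimal usage sketch (assuming the x86 __ATOMIC_HLE_* macros
   available in this generation of GCC):
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
   An HLE bit must be paired with a sufficiently strong base model,
   which is what the checks below enforce.  */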
42148 static unsigned HOST_WIDE_INT
42149 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42151 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42152 bool strong;
42154 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42155 |MEMMODEL_MASK)
42156 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42158 warning (OPT_Winvalid_memory_model,
42159 "Unknown architecture specific memory model");
42160 return MEMMODEL_SEQ_CST;
42162 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42163 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42165 warning (OPT_Winvalid_memory_model,
42166 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42167 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42169 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42171 warning (OPT_Winvalid_memory_model,
42172 "HLE_RELEASE not used with RELEASE or stronger memory model");
42173 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42175 return val;
42178 /* Initialize the GCC target structure. */
42179 #undef TARGET_RETURN_IN_MEMORY
42180 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42182 #undef TARGET_LEGITIMIZE_ADDRESS
42183 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42185 #undef TARGET_ATTRIBUTE_TABLE
42186 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42187 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42188 # undef TARGET_MERGE_DECL_ATTRIBUTES
42189 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42190 #endif
42192 #undef TARGET_COMP_TYPE_ATTRIBUTES
42193 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42195 #undef TARGET_INIT_BUILTINS
42196 #define TARGET_INIT_BUILTINS ix86_init_builtins
42197 #undef TARGET_BUILTIN_DECL
42198 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42199 #undef TARGET_EXPAND_BUILTIN
42200 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42202 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42203 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42204 ix86_builtin_vectorized_function
42206 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42207 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42209 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42210 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42212 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42213 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42215 #undef TARGET_BUILTIN_RECIPROCAL
42216 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42218 #undef TARGET_ASM_FUNCTION_EPILOGUE
42219 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42221 #undef TARGET_ENCODE_SECTION_INFO
42222 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42223 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42224 #else
42225 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42226 #endif
42228 #undef TARGET_ASM_OPEN_PAREN
42229 #define TARGET_ASM_OPEN_PAREN ""
42230 #undef TARGET_ASM_CLOSE_PAREN
42231 #define TARGET_ASM_CLOSE_PAREN ""
42233 #undef TARGET_ASM_BYTE_OP
42234 #define TARGET_ASM_BYTE_OP ASM_BYTE
42236 #undef TARGET_ASM_ALIGNED_HI_OP
42237 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42238 #undef TARGET_ASM_ALIGNED_SI_OP
42239 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42240 #ifdef ASM_QUAD
42241 #undef TARGET_ASM_ALIGNED_DI_OP
42242 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42243 #endif
42245 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42246 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42248 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42249 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42251 #undef TARGET_ASM_UNALIGNED_HI_OP
42252 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42253 #undef TARGET_ASM_UNALIGNED_SI_OP
42254 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42255 #undef TARGET_ASM_UNALIGNED_DI_OP
42256 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42258 #undef TARGET_PRINT_OPERAND
42259 #define TARGET_PRINT_OPERAND ix86_print_operand
42260 #undef TARGET_PRINT_OPERAND_ADDRESS
42261 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42262 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42263 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42264 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42265 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42267 #undef TARGET_SCHED_INIT_GLOBAL
42268 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42269 #undef TARGET_SCHED_ADJUST_COST
42270 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42271 #undef TARGET_SCHED_ISSUE_RATE
42272 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42273 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42274 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42275 ia32_multipass_dfa_lookahead
42277 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42278 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42280 #undef TARGET_MEMMODEL_CHECK
42281 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42283 #ifdef HAVE_AS_TLS
42284 #undef TARGET_HAVE_TLS
42285 #define TARGET_HAVE_TLS true
42286 #endif
42287 #undef TARGET_CANNOT_FORCE_CONST_MEM
42288 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42289 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42290 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42292 #undef TARGET_DELEGITIMIZE_ADDRESS
42293 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42295 #undef TARGET_MS_BITFIELD_LAYOUT_P
42296 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42298 #if TARGET_MACHO
42299 #undef TARGET_BINDS_LOCAL_P
42300 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42301 #endif
42302 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42303 #undef TARGET_BINDS_LOCAL_P
42304 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42305 #endif
42307 #undef TARGET_ASM_OUTPUT_MI_THUNK
42308 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42309 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42310 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42312 #undef TARGET_ASM_FILE_START
42313 #define TARGET_ASM_FILE_START x86_file_start
42315 #undef TARGET_OPTION_OVERRIDE
42316 #define TARGET_OPTION_OVERRIDE ix86_option_override
42318 #undef TARGET_REGISTER_MOVE_COST
42319 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42320 #undef TARGET_MEMORY_MOVE_COST
42321 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42322 #undef TARGET_RTX_COSTS
42323 #define TARGET_RTX_COSTS ix86_rtx_costs
42324 #undef TARGET_ADDRESS_COST
42325 #define TARGET_ADDRESS_COST ix86_address_cost
42327 #undef TARGET_FIXED_CONDITION_CODE_REGS
42328 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42329 #undef TARGET_CC_MODES_COMPATIBLE
42330 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42332 #undef TARGET_MACHINE_DEPENDENT_REORG
42333 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42335 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42336 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42338 #undef TARGET_BUILD_BUILTIN_VA_LIST
42339 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42341 #undef TARGET_FOLD_BUILTIN
42342 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42344 #undef TARGET_COMPARE_VERSION_PRIORITY
42345 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42347 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42348 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42349 ix86_generate_version_dispatcher_body
42351 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42352 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42353 ix86_get_function_versions_dispatcher
42355 #undef TARGET_ENUM_VA_LIST_P
42356 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42358 #undef TARGET_FN_ABI_VA_LIST
42359 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42361 #undef TARGET_CANONICAL_VA_LIST_TYPE
42362 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42364 #undef TARGET_EXPAND_BUILTIN_VA_START
42365 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42367 #undef TARGET_MD_ASM_CLOBBERS
42368 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42370 #undef TARGET_PROMOTE_PROTOTYPES
42371 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42372 #undef TARGET_STRUCT_VALUE_RTX
42373 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42374 #undef TARGET_SETUP_INCOMING_VARARGS
42375 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42376 #undef TARGET_MUST_PASS_IN_STACK
42377 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42378 #undef TARGET_FUNCTION_ARG_ADVANCE
42379 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42380 #undef TARGET_FUNCTION_ARG
42381 #define TARGET_FUNCTION_ARG ix86_function_arg
42382 #undef TARGET_FUNCTION_ARG_BOUNDARY
42383 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42384 #undef TARGET_PASS_BY_REFERENCE
42385 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42386 #undef TARGET_INTERNAL_ARG_POINTER
42387 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42388 #undef TARGET_UPDATE_STACK_BOUNDARY
42389 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42390 #undef TARGET_GET_DRAP_RTX
42391 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42392 #undef TARGET_STRICT_ARGUMENT_NAMING
42393 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42394 #undef TARGET_STATIC_CHAIN
42395 #define TARGET_STATIC_CHAIN ix86_static_chain
42396 #undef TARGET_TRAMPOLINE_INIT
42397 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42398 #undef TARGET_RETURN_POPS_ARGS
42399 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42401 #undef TARGET_LEGITIMATE_COMBINED_INSN
42402 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42404 #undef TARGET_ASAN_SHADOW_OFFSET
42405 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42407 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42408 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42410 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42411 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42413 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42414 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42416 #undef TARGET_C_MODE_FOR_SUFFIX
42417 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42419 #ifdef HAVE_AS_TLS
42420 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42421 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42422 #endif
42424 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42425 #undef TARGET_INSERT_ATTRIBUTES
42426 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42427 #endif
42429 #undef TARGET_MANGLE_TYPE
42430 #define TARGET_MANGLE_TYPE ix86_mangle_type
42432 #if !TARGET_MACHO
42433 #undef TARGET_STACK_PROTECT_FAIL
42434 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42435 #endif
42437 #undef TARGET_FUNCTION_VALUE
42438 #define TARGET_FUNCTION_VALUE ix86_function_value
42440 #undef TARGET_FUNCTION_VALUE_REGNO_P
42441 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42443 #undef TARGET_PROMOTE_FUNCTION_MODE
42444 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42446 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42447 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42449 #undef TARGET_INSTANTIATE_DECLS
42450 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42452 #undef TARGET_SECONDARY_RELOAD
42453 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42455 #undef TARGET_CLASS_MAX_NREGS
42456 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42458 #undef TARGET_PREFERRED_RELOAD_CLASS
42459 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42460 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42461 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42462 #undef TARGET_CLASS_LIKELY_SPILLED_P
42463 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42465 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42466 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42467 ix86_builtin_vectorization_cost
42468 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42469 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42470 ix86_vectorize_vec_perm_const_ok
42471 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42472 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42473 ix86_preferred_simd_mode
42474 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42475 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42476 ix86_autovectorize_vector_sizes
42477 #undef TARGET_VECTORIZE_INIT_COST
42478 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42479 #undef TARGET_VECTORIZE_ADD_STMT_COST
42480 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42481 #undef TARGET_VECTORIZE_FINISH_COST
42482 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42483 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42484 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42486 #undef TARGET_SET_CURRENT_FUNCTION
42487 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42489 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42490 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42492 #undef TARGET_OPTION_SAVE
42493 #define TARGET_OPTION_SAVE ix86_function_specific_save
42495 #undef TARGET_OPTION_RESTORE
42496 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42498 #undef TARGET_OPTION_PRINT
42499 #define TARGET_OPTION_PRINT ix86_function_specific_print
42501 #undef TARGET_OPTION_FUNCTION_VERSIONS
42502 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42504 #undef TARGET_CAN_INLINE_P
42505 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42507 #undef TARGET_EXPAND_TO_RTL_HOOK
42508 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42510 #undef TARGET_LEGITIMATE_ADDRESS_P
42511 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42513 #undef TARGET_LRA_P
42514 #define TARGET_LRA_P hook_bool_void_true
42516 #undef TARGET_REGISTER_PRIORITY
42517 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42519 #undef TARGET_LEGITIMATE_CONSTANT_P
42520 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42522 #undef TARGET_FRAME_POINTER_REQUIRED
42523 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42525 #undef TARGET_CAN_ELIMINATE
42526 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42528 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42529 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42531 #undef TARGET_ASM_CODE_END
42532 #define TARGET_ASM_CODE_END ix86_code_end
42534 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42535 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42537 #if TARGET_MACHO
42538 #undef TARGET_INIT_LIBFUNCS
42539 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42540 #endif
42542 #undef TARGET_SPILL_CLASS
42543 #define TARGET_SPILL_CLASS ix86_spill_class
42545 struct gcc_target targetm = TARGET_INITIALIZER;
42547 #include "gt-i386.h"